init
- G_157000.pth +3 -0
- app.py +99 -0
- ds_inference.py +33 -0
- egs/visinger2/__init__.py +0 -0
- egs/visinger2/bash/train.sh +13 -0
- egs/visinger2/config.json +68 -0
- egs/visinger2/dataset.py +334 -0
- egs/visinger2/inference.py +123 -0
- egs/visinger2/models.py +1023 -0
- egs/visinger2/train.py +456 -0
- infer/__init__.py +122 -0
- infer/share.ds +62 -0
- modules/attentions.py +397 -0
- modules/commons.py +162 -0
- modules/ddsp.py +189 -0
- modules/losses.py +62 -0
- modules/modules.py +450 -0
- modules/stft.py +512 -0
- modules/transforms.py +193 -0
- preprocess/mel_processing.py +104 -0
- preprocess/prepare_multispeaker.py +10 -0
- preprocess/preprocess.py +103 -0
- preprocess/preprocess_multispeaker.py +28 -0
- requirements.txt +9 -0
- text/npu/__init__.py +2 -0
- text/npu/symbol_converter.py +34 -0
- text/npu/symbols.py +61 -0
- utils/__init__.py +0 -0
- utils/audio.py +99 -0
- utils/utils.py +268 -0
G_157000.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87e9b24e8cd7987f493629494018da15d4a4cbb34b1b788f3291a63851b2aaa1
size 143295815
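This is a Git LFS pointer file rather than the checkpoint itself: the repository stores only the oid/size stub above, while the actual ~143 MB generator weights (G_157000.pth) are fetched from LFS storage.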
app.py
ADDED
@@ -0,0 +1,99 @@
import io
import json
import os
import gradio as gr
import librosa
import numpy as np
import soundfile
import torch
import logging

from egs.visinger2.models import SynthesizerTrn
from infer import infer_ds
from utils import utils

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)

config_json = "egs/visinger2/config.json"
model_path = "G_157000.pth"

hps = utils.get_hparams_from_file(config_json)
net_g = SynthesizerTrn(hps)
_ = net_g.eval()
_ = utils.load_checkpoint(model_path, net_g, None)

def vc_fn(speaker, ds, vc_transform):
    try:
        ds = json.loads(ds)
    except:
        return "工程文件json解析失败,请将ds文件的完整内容粘贴与此处", None

    dur = 0
    flag = False
    try:
        for inp in ds:
            f0_seq = inp["f0_seq"]
            ph_dur = inp["ph_dur"]
            ph_dur = [float(i) for i in ph_dur.split(" ")]
            f0_seq = [float(i) for i in f0_seq.split(" ")]
            dur += sum(ph_dur)
            if sum(ph_dur) > 30:
                flag = True
    except:
        return "ds工程需要冻结f0和音素参数才能使用此模型合成", None
    if flag:
        return "单个切片时长必须小于30s,否则请使用本地推理", None
    if dur > 120:
        return "总时长需要小于2分钟,否则请使用本地推理", None
    out_audio = infer_ds(net_g, hps, ds, speaker, vc_transform)
    # return "请上传小于45s的音频,需要转换长音频请本地进行转换", None
    # out_audio, out_sr = inference_main.infer(sid, out_wav_path, model_map[model], vc_transform)
    # _audio = out_audio.cpu().numpy()
    return "Success", (44100, out_audio.astype(np.float32))


app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("Basic"):
            gr.Markdown(value="""
这是visinger2 塔菲、电棍模型的在线demo, github 仓库地址是[visinger2-nomidi](https://github.com/innnky/VISinger2-nomidi)

由于训练集为录播数据全自动化制作,因此质量比较差,此模型并非visinger2的音质上限,最高质量模型效果请参照[VISinger2官方demo](https://zhangyongmao.github.io/VISinger2/)

其中ds工程文件为[DiffSinger](https://github.com/openvpi/DiffSinger)工程,需要通过[OpenSVIP](https://openvpi.github.io/) 转换器进行制作,原理是先通过别的歌声合成软件制作工程并转换为模型能够接受的输入格式。

由于此模型是nomidi模型,因此导出ds工程时需要冻结音素和音高参数, 否则会报错,具体DiffSinger工程制作详细问题可以加入DiffSinger QQ交流群 907879266

在线推理限制为总时长小于2分钟,且单个切片时长小于30s,有更大需求请下载本仓库或github仓库代码运行ds_inference.py进行本地推理
""")
            sid = gr.Dropdown(label="音色", choices=["taffy", "otto"], value="taffy")
            vc_input3 = gr.TextArea(label="ds工程(json格式)", value='''[
{
"text": "SP 清 晨 SP",
"ph_seq": "SP q ing ch en SP",
"note_seq": "rest D4 D4 G4 G4 rest",
"note_dur_seq": "0.6 0.273 0.273 0.4089999 0.4089999 0.4",
"is_slur_seq": "0 0 0 0 0 0",
"ph_dur": "0.469318 0.130682 0.120727 0.152273 0.409 0.4",
"f0_timestep": "0.005",
"f0_seq": "301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 302.0 302.4 301.9 301.4 300.5 299.4 299.0 298.3 297.9 297.6 297.2 297.2 297.0 296.8 296.9 296.7 296.6 296.8 296.9 296.9 297.4 297.6 297.7 298.2 298.5 298.3 298.6 298.7 298.5 298.6 298.3 297.8 296.4 293.9 291.5 286.7 283.2 279.6 278.5 283.4 288.4 293.5 298.6 303.9 309.3 314.7 320.3 325.9 331.7 337.5 343.5 349.5 355.7 362.0 368.3 374.8 381.5 387.1 388.7 391.3 393.6 396.1 397.7 398.7 399.3 399.6 399.8 399.4 399.0 398.6 397.9 397.7 397.1 396.7 396.1 396.0 395.4 395.6 395.7 395.9 395.9 396.1 396.4 396.8 397.0 397.3 397.5 397.5 397.5 397.7 397.7 397.7 397.7 397.9 397.7 397.7 397.7 397.7 397.7 397.7 397.5 397.5 397.2 397.0 397.0 396.7 396.6 396.6 396.5 396.3 396.3 396.1 396.1 396.3 396.3 396.1 396.3 396.3 396.4 396.6 396.7 396.6 396.9 397.2 396.8 397.4 397.9 398.0 398.5 399.1 399.1 399.1 399.0 398.7 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2",
"input_type": "phoneme",
"offset": 0.0
}
]''')
            vc_transform = gr.Number(label="变调(整数,可以正负,半音数量,升高八度就是12)", value=0)
            # model = gr.Dropdown(label="模型", choices=list(model_map.keys()), value="G_34000.pth")
            vc_submit = gr.Button("合成", variant="primary")
            vc_output1 = gr.Textbox(label="Output Message")
            vc_output2 = gr.Audio(label="Output Audio")
            vc_submit.click(vc_fn, [sid, vc_input3, vc_transform], [vc_output1, vc_output2])

app.launch()
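In brief, the Chinese Markdown shown in the Basic tab says: this is an online demo of the VISinger2 taffy/otto models (repo: visinger2-nomidi); the training data was produced fully automatically from stream recordings, so quality is well below VISinger2's ceiling (see the official VISinger2 demo for its upper bound); the ds project file is a DiffSinger project created through the OpenSVIP converter, i.e. a project authored in another singing-synthesis editor and converted into the input format this model accepts; because this is a no-MIDI model, phoneme and pitch parameters must be frozen when exporting the ds project or synthesis fails (DiffSinger QQ group 907879266 handles project-authoring questions); online inference is capped at 2 minutes total and 30 s per slice, beyond which ds_inference.py should be run locally.

vc_fn itself only validates the ph_dur and f0_seq fields before handing the parsed list to infer_ds. A minimal sketch of one ds slice with those fields frozen (field names taken from the default TextArea value above; infer_ds lives in infer/__init__.py, which is not shown in this section, and may also require the remaining keys such as ph_seq and note_dur_seq):

import json

# One slice of a ds project with frozen phoneme durations and f0 curve (illustrative values).
entry = {
    "ph_seq": "SP q ing ch en SP",
    "ph_dur": "0.469318 0.130682 0.120727 0.152273 0.409 0.4",  # seconds per phoneme, space-separated
    "f0_timestep": "0.005",
    "f0_seq": "301.9 301.9 301.9",                              # shortened for the sketch
    "input_type": "phoneme",
    "offset": 0.0,
}
ds_text = json.dumps([entry])  # the string pasted into the ds TextArea
# vc_fn("taffy", ds_text, 0) parses ph_dur/f0_seq, checks the 30 s / 120 s limits, then calls infer_ds(...)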
ds_inference.py
ADDED
@@ -0,0 +1,33 @@
import json
import os
import time
import re

import numpy as np
import soundfile
import torch
import tqdm
from scipy.interpolate import interp1d

from utils import utils
from egs.visinger2.models import SynthesizerTrn
from infer import preprocess, cross_fade, infer_ds

trans = -12
speaker = "otto"
ds_path = "infer/share.ds"
config_json = "egs/visinger2/config.json"
checkpoint_path = f"/Volumes/Extend/下载/G_110000.pth"
file_name = os.path.splitext(os.path.basename(ds_path))[0]
step = re.findall(r'G_(\d+)\.pth', checkpoint_path)[0]


if __name__ == '__main__':
    ds = json.load(open(ds_path))
    hps = utils.get_hparams_from_file(config_json)
    net_g = SynthesizerTrn(hps)
    _ = net_g.eval()
    _ = utils.load_checkpoint(checkpoint_path, net_g, None)

    audio = infer_ds(net_g, hps, ds, speaker, trans)
    soundfile.write(f"{speaker}_{file_name}_{step}step.wav", audio, 44100)
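To run this locally, edit the constants at the top (checkpoint_path, ds_path, speaker, trans) and execute the script from the repository root with the root on PYTHONPATH; the result is written as <speaker>_<ds name>_<step>step.wav at 44.1 kHz.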
egs/visinger2/__init__.py
ADDED
File without changes
egs/visinger2/bash/train.sh
ADDED
@@ -0,0 +1,13 @@
num_gpu=$1

cd $(dirname $(dirname $0))
exp_dir=$(pwd)
base_dir=$(dirname $(dirname $exp_dir))
config=${exp_dir}/config.json

export PYTHONPATH=$base_dir
export PYTHONIOENCODING=UTF-8

CUDA_VISIBLE_DEVICES=${num_gpu} python train.py -c config.json
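The script takes the GPU id as its only argument (e.g. bash egs/visinger2/bash/train.sh 0): it changes into egs/visinger2, points PYTHONPATH at the repository root, and launches train.py with the local config.json on the selected GPU.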
egs/visinger2/config.json
ADDED
@@ -0,0 +1,68 @@
{
  "train": {
    "log_interval": 200,
    "eval_interval": 1000,
    "seed": 1234,
    "port": 8001,
    "epochs": 10000,
    "learning_rate": 2e-4,
    "betas": [0.8, 0.99],
    "eps": 1e-9,
    "batch_size": 8,
    "accumulation_steps": 1,
    "fp16_run": false,
    "lr_decay": 0.998,
    "segment_size": 10240,
    "init_lr_ratio": 1,
    "warmup_epochs": 0,
    "c_mel": 45,
    "save_dir": "logdir/visinger2"
  },
  "data": {
    "data_dir": "../../data",
    "dataset_type": "SingDataset",
    "collate_type": "SingCollate",
    "training_filelist": "train.list",
    "training_labellist": "transcriptions.txt",
    "validation_filelist": "test.list",
    "validation_labellist": "transcriptions.txt",
    "max_wav_value": 32768.0,
    "sample_rate": 44100,
    "n_fft": 2048,
    "fmin": 0,
    "fmax": 22050,
    "hop_size": 512,
    "win_size": 2048,
    "acoustic_dim": 80,
    "min_level_db": -115,
    "ref_level_db": 20,
    "min_db": -115,
    "max_abs_value": 4.0,
    "n_speakers": 200,
    "spk2id": {"opencpop": 0, "taffy": 1, "otto": 2, "nanami": 3}
  },
  "model": {
    "hidden_channels": 192,
    "spk_channels": 192,
    "filter_channels": 768,
    "n_heads": 2,
    "n_layers": 4,
    "kernel_size": 3,
    "p_dropout": 0.1,
    "prior_hidden_channels": 192,
    "prior_filter_channels": 768,
    "prior_n_heads": 2,
    "prior_n_layers": 4,
    "prior_kernel_size": 3,
    "prior_p_dropout": 0.1,
    "resblock": "1",
    "use_spectral_norm": false,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    "upsample_rates": [8, 8, 4, 2],
    "upsample_initial_channel": 256,
    "upsample_kernel_sizes": [16, 16, 8, 4],
    "n_harmonic": 64,
    "n_bands": 65
  }
}
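A minimal sketch of how these hyperparameters are consumed by the rest of the repo (paths assumed; utils.get_hparams_from_file exposes the JSON as nested attributes, as in app.py and dataset.py):

from utils import utils

hps = utils.get_hparams_from_file("egs/visinger2/config.json")
frames_per_second = hps.data.sample_rate / hps.data.hop_size   # 44100 / 512 ≈ 86.1 mel/f0 frames per second
segment_frames = hps.train.segment_size // hps.data.hop_size   # 10240 / 512 = 20 frames per training slice
taffy_id = hps.data.spk2id["taffy"]                            # speaker name -> embedding index, here 1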
egs/visinger2/dataset.py
ADDED
@@ -0,0 +1,334 @@
import os
import sys
import string
import random
import numpy as np
import math
import json
from torch.utils.data import DataLoader
import torch

sys.path.append('../..')
from utils.audio import load_wav
from text import npu

class BaseDataset(torch.utils.data.Dataset):

    def __init__(self, hparams, fileid_list_path):
        self.hparams = hparams
        self.fileid_list = self.get_fileid_list(fileid_list_path)
        random.seed(hparams.train.seed)
        random.shuffle(self.fileid_list)
        if(hparams.data.n_speakers > 0):
            self.spk2id = hparams.data.spk2id

    def get_fileid_list(self, fileid_list_path):
        fileid_list = []
        with open(fileid_list_path, 'r') as f:
            for line in f.readlines():
                fileid_list.append(line.strip())

        return fileid_list

    def __len__(self):
        return len(self.fileid_list)

class SingDataset(BaseDataset):
    def __init__(self, hparams, data_dir, fileid_list_path, label_list_path):
        BaseDataset.__init__(self, hparams, os.path.join(data_dir, fileid_list_path))
        self.hps = hparams

        with open(os.path.join(data_dir, label_list_path), "r") as in_file:
            self.id2label = {}
            for line in in_file.readlines():
                fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|')
                self.id2label[fileid] = [phones, pitchid, dur, slur, gtdur]

        self.data_dir = data_dir
        # self.__filter__()

    def __filter__(self):
        new_fileid_list = []
        print("before filter: ", len(self.fileid_list))
        for file_id in self.fileid_list:
            _is_qualified = True
            if(not os.path.exists(os.path.join(self.label_dir, self.fileid_list[index] + '.lab')) or
               not os.path.exists(os.path.join(self.dur_dir, self.fileid_list[index] + '.lab')) or
               not os.path.exists(os.path.join(self.mel_dir, self.fileid_list[index] + '.npy')) or
               not os.path.exists(os.path.join(self.pitch_dir, self.fileid_list[index] + '.npy'))):
                _is_qualified = False
            if(_is_qualified):
                new_fileid_list.append(file_id)
        self.fileid_list = new_fileid_list
        print("after filter: ", len(self.fileid_list))

    def interpolate_f0(self, data):
        '''
        Interpolate F0 across unvoiced (zero) frames; also return a voiced/unvoiced mask.
        '''
        data = np.reshape(data, (data.size, 1))

        vuv_vector = np.zeros((data.size, 1), dtype=np.float32)
        vuv_vector[data > 0.0] = 1.0
        vuv_vector[data <= 0.0] = 0.0

        ip_data = data

        frame_number = data.size
        last_value = 0.0
        for i in range(frame_number):
            if data[i] <= 0.0:
                j = i + 1
                for j in range(i + 1, frame_number):
                    if data[j] > 0.0:
                        break
                if j < frame_number - 1:
                    if last_value > 0.0:
                        step = (data[j] - data[i - 1]) / float(j - i)
                        for k in range(i, j):
                            ip_data[k] = data[i - 1] + step * (k - i + 1)
                    else:
                        for k in range(i, j):
                            ip_data[k] = data[j]
                else:
                    for k in range(i, frame_number):
                        ip_data[k] = last_value
            else:
                ip_data[i] = data[i]
                last_value = data[i]

        return ip_data, vuv_vector

    def parse_label(self, pho, pitchid, dur, slur, gtdur):
        phos = []
        pitchs = []
        durs = []
        slurs = []
        gtdurs = []

        for index in range(len(pho.split())):
            phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]])
            pitchs.append(0)
            durs.append(0)
            slurs.append(0)
            gtdurs.append(float(gtdur.strip().split()[index]))

        phos = np.asarray(phos, dtype=np.int32)
        pitchs = np.asarray(pitchs, dtype=np.int32)
        durs = np.asarray(durs, dtype=np.float32)
        slurs = np.asarray(slurs, dtype=np.int32)
        gtdurs = np.asarray(gtdurs, dtype=np.float32)

        acc_duration = np.cumsum(gtdurs)
        acc_duration = np.pad(acc_duration, (1, 0), 'constant', constant_values=(0,))
        acc_duration_frames = np.ceil(acc_duration / (self.hps.data.hop_size / self.hps.data.sample_rate))
        gtdurs = acc_duration_frames[1:] - acc_duration_frames[:-1]

        phos = torch.LongTensor(phos)
        pitchs = torch.LongTensor(pitchs)
        durs = torch.FloatTensor(durs)
        slurs = torch.LongTensor(slurs)
        gtdurs = torch.LongTensor(gtdurs)
        return phos, pitchs, durs, slurs, gtdurs

    def __getitem__(self, index):

        pho, pitchid, dur, slur, gtdur = self.id2label[self.fileid_list[index]]
        pho, pitchid, dur, slur, gtdur = self.parse_label(pho, pitchid, dur, slur, gtdur)
        sum_dur = gtdur.sum()
        spk, fileid = self.fileid_list[index].split("/")
        spkid = self.spk2id[spk]
        mel = np.load(os.path.join(self.data_dir, spk, "mels", fileid + '.npy'))
        if mel.shape[0] < 150:
            print("drop short audio:", self.fileid_list[index])
            return None
        assert mel.shape[1] == 80
        if(mel.shape[0] != sum_dur):
            if(abs(mel.shape[0] - sum_dur) > 3):
                print("dataset error mel: ", mel.shape, sum_dur)
                return None
            if(mel.shape[0] > sum_dur):
                mel = mel[:sum_dur]
            else:
                mel = np.concatenate([mel, mel.min() * np.ones([sum_dur - mel.shape[0], self.hps.data.acoustic_dim])], axis=0)
        mel = torch.FloatTensor(mel).transpose(0, 1)

        f0 = np.load(os.path.join(self.data_dir, spk, "pitch", fileid + '.npy')).reshape([-1])
        f0, _ = self.interpolate_f0(f0)
        f0 = f0.reshape([-1])
        if(f0.shape[0] != sum_dur):
            if(abs(f0.shape[0] - sum_dur) > 3):
                print("dataset error f0 : ", f0.shape, sum_dur)
                return None
            if(f0.shape[0] > sum_dur):
                f0 = f0[:sum_dur]
            else:
                f0 = np.concatenate([f0, np.zeros([sum_dur - f0.shape[0]])], axis=0)
        f0 = torch.FloatTensor(f0).reshape([1, -1])

        wav = load_wav(os.path.join(self.data_dir, spk, "wavs", fileid + '.wav'),
                       raw_sr=self.hparams.data.sample_rate,
                       target_sr=self.hparams.data.sample_rate,
                       win_size=self.hparams.data.win_size,
                       hop_size=self.hparams.data.hop_size)
        wav = wav.reshape(-1)
        if(wav.shape[0] != sum_dur * self.hparams.data.hop_size):
            if(abs(wav.shape[0] - sum_dur * self.hparams.data.hop_size) > 3 * self.hparams.data.hop_size):
                print("dataset error wav : ", wav.shape, sum_dur)
                return None
            if(wav.shape[0] > sum_dur * self.hparams.data.hop_size):
                wav = wav[:sum_dur * self.hparams.data.hop_size]
            else:
                wav = np.concatenate([wav, np.zeros([sum_dur * self.hparams.data.hop_size - wav.shape[0]])], axis=0)
        wav = torch.FloatTensor(wav).reshape([1, -1])

        return pho, pitchid, dur, slur, gtdur, mel, f0, wav, spkid


class SingCollate():

    def __init__(self, hparams):
        self.hparams = hparams
        self.mel_dim = self.hparams.data.acoustic_dim

    def __call__(self, batch):

        batch = [b for b in batch if b is not None]

        input_lengths, ids_sorted_decreasing = torch.sort(
            torch.LongTensor([len(x[0]) for x in batch]),
            dim=0, descending=True)

        max_phone_len = max([len(x[0]) for x in batch])
        max_pitchid_len = max([len(x[1]) for x in batch])
        max_dur_len = max([len(x[2]) for x in batch])
        max_slur_len = max([len(x[3]) for x in batch])
        max_gtdur_len = max([len(x[4]) for x in batch])
        max_mel_len = max([x[5].size(1) for x in batch])
        max_f0_len = max([x[6].size(1) for x in batch])
        max_wav_len = max([x[7].size(1) for x in batch])

        phone_lengths = torch.LongTensor(len(batch))
        pitchid_lengths = torch.LongTensor(len(batch))
        dur_lengths = torch.LongTensor(len(batch))
        slur_lengths = torch.LongTensor(len(batch))
        gtdur_lengths = torch.LongTensor(len(batch))
        mel_lengths = torch.LongTensor(len(batch))
        f0_lengths = torch.LongTensor(len(batch))
        wav_lengths = torch.LongTensor(len(batch))

        phone_padded = torch.LongTensor(len(batch), max_phone_len)
        pitchid_padded = torch.LongTensor(len(batch), max_pitchid_len)
        dur_padded = torch.FloatTensor(len(batch), max_dur_len)
        slur_padded = torch.LongTensor(len(batch), max_slur_len)
        gtdur_padded = torch.LongTensor(len(batch), 1, max_gtdur_len)
        mel_padded = torch.FloatTensor(len(batch), self.hparams.data.acoustic_dim, max_mel_len)
        f0_padded = torch.FloatTensor(len(batch), 1, max_f0_len)
        wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
        spkids = torch.LongTensor(len(batch))

        phone_padded.zero_()
        pitchid_padded.zero_()
        dur_padded.zero_()
        slur_padded.zero_()
        gtdur_padded.zero_()
        mel_padded.zero_()
        f0_padded.zero_()
        wav_padded.zero_()

        for i in range(len(ids_sorted_decreasing)):
            row = batch[ids_sorted_decreasing[i]]

            phone = row[0]
            phone_padded[i, :phone.size(0)] = phone
            phone_lengths[i] = phone.size(0)

            pitchid = row[1]
            pitchid_padded[i, :pitchid.size(0)] = pitchid
            pitchid_lengths[i] = pitchid.size(0)

            dur = row[2]
            dur_padded[i, :dur.size(0)] = dur
            dur_lengths[i] = dur.size(0)

            slur = row[3]
            slur_padded[i, :slur.size(0)] = slur
            slur_lengths[i] = slur.size(0)

            gtdur = row[4]
            gtdur_padded[i, :, :gtdur.size(0)] = gtdur
            gtdur_lengths[i] = gtdur.size(0)

            mel = row[5]
            mel_padded[i, :, :mel.size(1)] = mel
            mel_lengths[i] = mel.size(1)

            f0 = row[6]
            f0_padded[i, :, :f0.size(1)] = f0
            f0_lengths[i] = f0.size(1)

            wav = row[7]
            wav_padded[i, :, :wav.size(1)] = wav
            wav_lengths[i] = wav.size(1)

            spkids[i] = row[8]

        data_dict = {}
        data_dict["phone"] = phone_padded
        data_dict["phone_lengths"] = phone_lengths
        data_dict["pitchid"] = pitchid_padded
        data_dict["dur"] = dur_padded
        data_dict["slur"] = slur_padded
        data_dict["gtdur"] = gtdur_padded
        data_dict["mel"] = mel_padded
        data_dict["f0"] = f0_padded
        data_dict["wav"] = wav_padded

        data_dict["mel_lengths"] = mel_lengths
        data_dict["f0_lengths"] = f0_lengths
        data_dict["wav_lengths"] = wav_lengths
        data_dict["spkid"] = spkids

        return data_dict


class DatasetConstructor():

    def __init__(self, hparams, num_replicas=1, rank=1):
        self.hparams = hparams
        self.num_replicas = num_replicas
        self.rank = rank
        self.dataset_function = {"SingDataset": SingDataset}
        self.collate_function = {"SingCollate": SingCollate}
        self._get_components()

    def _get_components(self):
        self._init_datasets()
        self._init_collate()
        self._init_data_loaders()

    def _init_datasets(self):
        self._train_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.training_filelist, self.hparams.data.training_labellist)
        self._valid_dataset = self.dataset_function[self.hparams.data.dataset_type](self.hparams, self.hparams.data.data_dir, self.hparams.data.validation_filelist, self.hparams.data.validation_labellist)

    def _init_collate(self):
        self._collate_fn = self.collate_function[self.hparams.data.collate_type](self.hparams)

    def _init_data_loaders(self):
        train_sampler = torch.utils.data.distributed.DistributedSampler(self._train_dataset, num_replicas=self.num_replicas, rank=self.rank, shuffle=True)

        self.train_loader = DataLoader(self._train_dataset, num_workers=4, shuffle=False,
                                       batch_size=self.hparams.train.batch_size, pin_memory=True,
                                       drop_last=True, collate_fn=self._collate_fn, sampler=train_sampler)

        self.valid_loader = DataLoader(self._valid_dataset, num_workers=1, shuffle=False,
                                       batch_size=1, pin_memory=True,
                                       drop_last=True, collate_fn=self._collate_fn)

    def get_train_loader(self):
        return self.train_loader

    def get_valid_loader(self):
        return self.valid_loader
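parse_label converts ground-truth phoneme durations from seconds to frame counts via a cumulative sum followed by ceil, so per-phoneme rounding never drifts against the total length. A small worked example using values from the demo ds entry and config.json:

import numpy as np

hop_size, sample_rate = 512, 44100                                        # from egs/visinger2/config.json
ph_dur = np.array([0.469318, 0.130682, 0.120727, 0.152273, 0.409, 0.4])  # seconds per phoneme

acc = np.pad(np.cumsum(ph_dur), (1, 0), 'constant')       # cumulative end times with a leading 0
frames = np.ceil(acc / (hop_size / sample_rate))           # cumulative frame boundaries
per_phone = frames[1:] - frames[:-1]                       # frames per phoneme: [41, 11, 11, 13, 35, 34]
assert per_phone.sum() == frames[-1]                       # the total always matches the last boundary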
egs/visinger2/inference.py
ADDED
@@ -0,0 +1,123 @@
import matplotlib.pyplot as plt
import IPython.display as ipd

import sys
import os
import json
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader

import modules.commons as commons
import utils.utils as utils
from models import SynthesizerTrn
from text import npu
from scipy.io.wavfile import write
from tqdm import tqdm
import numpy as np
import time
import argparse

def parse_label(hps, pho, pitchid, dur, slur, gtdur):
    phos = []
    pitchs = []
    durs = []
    slurs = []
    gtdurs = []

    for index in range(len(pho.split())):
        phos.append(npu.symbol_converter.ttsing_phone_to_int[pho.strip().split()[index]])
        pitchs.append(npu.symbol_converter.ttsing_opencpop_pitch_to_int[pitchid.strip().split()[index]])
        durs.append(float(dur.strip().split()[index]))
        slurs.append(int(slur.strip().split()[index]))
        gtdurs.append(float(gtdur.strip().split()[index]))

    phos = np.asarray(phos, dtype=np.int32)
    pitchs = np.asarray(pitchs, dtype=np.int32)
    durs = np.asarray(durs, dtype=np.float32)
    slurs = np.asarray(slurs, dtype=np.int32)
    gtdurs = np.asarray(gtdurs, dtype=np.float32)
    gtdurs = np.ceil(gtdurs / (hps.data.hop_size / hps.data.sample_rate))

    phos = torch.LongTensor(phos)
    pitchs = torch.LongTensor(pitchs)
    durs = torch.FloatTensor(durs)
    slurs = torch.LongTensor(slurs)
    gtdurs = torch.LongTensor(gtdurs)
    return phos, pitchs, durs, slurs, gtdurs

def load_model(model_dir):

    # load config and model
    model_path = utils.latest_checkpoint_path(model_dir)
    config_path = os.path.join(model_dir, "config.json")

    hps = utils.get_hparams_from_file(config_path)

    print("Load model from : ", model_path)
    print("config: ", config_path)

    net_g = SynthesizerTrn(hps)
    _ = net_g.eval()
    _ = utils.load_checkpoint(model_path, net_g, None)
    return net_g, hps

def inference_label2wav(net_g, label_list_path, output_dir, hps, cuda_id=None):

    id2label = {}
    with open(label_list_path, "r") as in_file:
        for line in in_file.readlines():
            fileid, txt, phones, pitchid, dur, gtdur, slur = line.split('|')
            id2label[fileid] = [phones, pitchid, dur, slur, gtdur]

    for file_name in tqdm(id2label.keys()):
        pho, pitchid, dur, slur, gtdur = id2label[file_name]
        pho, pitchid, dur, slur, gtdur = parse_label(hps, pho, pitchid, dur, slur, gtdur)

        with torch.no_grad():

            # data
            pho_lengths = torch.LongTensor([pho.size(0)])
            pho = pho.unsqueeze(0)
            pitchid = pitchid.unsqueeze(0)
            dur = dur.unsqueeze(0)
            slur = slur.unsqueeze(0)

            if(cuda_id != None):
                net_g = net_g.cuda(0)
                pho = pho.cuda(0)
                pho_lengths = pho_lengths.cuda(0)
                pitchid = pitchid.cuda(0)
                dur = dur.cuda(0)
                slur = slur.cuda(0)

            # infer
            o, _, _ = net_g.infer(pho, pho_lengths, pitchid, dur, slur)
            audio = o[0, 0].data.cpu().float().numpy()
            audio = audio * 32768  # hps.data.max_wav_value
            audio = audio.astype(np.int16)

            # save
            write(os.path.join(output_dir, file_name.split('.')[0] + '.wav'), hps.data.sample_rate, audio)

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('-model_dir', '--model_dir', type=str, required=True)
    parser.add_argument('-input_dir', '--input_dir', type=str, required=True)
    parser.add_argument('-output_dir', '--output_dir', type=str, required=True)
    args = parser.parse_args()

    model_dir = args.model_dir
    input_dir = args.input_dir
    output_dir = args.output_dir

    model, hps = load_model(model_dir)
    if(not os.path.exists(output_dir)):
        os.makedirs(output_dir)
    print("load model end!")

    inference_label2wav(model, input_dir, output_dir, hps, cuda_id=0)
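Per the argparse block above, a typical invocation is: python egs/visinger2/inference.py --model_dir <dir containing G_*.pth and config.json> --input_dir <label file> --output_dir <wav output dir>. Note that despite its name, input_dir is the path to a transcriptions-style label file (fileid|txt|phones|pitchid|dur|gtdur|slur per line), not a directory.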
egs/visinger2/models.py
ADDED
@@ -0,0 +1,1023 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import copy
|
3 |
+
import math
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
from torch.nn import functional as F
|
7 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
8 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
9 |
+
|
10 |
+
sys.path.append('../..')
|
11 |
+
import modules.commons as commons
|
12 |
+
import modules.modules as modules
|
13 |
+
import modules.attentions as attentions
|
14 |
+
|
15 |
+
from modules.commons import init_weights, get_padding
|
16 |
+
from text.npu.symbols import ttsing_phone_set, ttsing_opencpop_pitch_set, ttsing_slur_set
|
17 |
+
|
18 |
+
from modules.ddsp import mlp, gru, scale_function, remove_above_nyquist, upsample
|
19 |
+
from modules.ddsp import harmonic_synth, amp_to_impulse_response, fft_convolve
|
20 |
+
from modules.ddsp import resample
|
21 |
+
|
22 |
+
from modules.stft import TorchSTFT
|
23 |
+
|
24 |
+
import torch.distributions as D
|
25 |
+
|
26 |
+
from modules.losses import (
|
27 |
+
generator_loss,
|
28 |
+
discriminator_loss,
|
29 |
+
feature_loss,
|
30 |
+
kl_loss
|
31 |
+
)
|
32 |
+
|
33 |
+
LRELU_SLOPE = 0.1
|
34 |
+
|
35 |
+
|
36 |
+
class DurationPredictor(nn.Module):
|
37 |
+
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_speakers=0, spk_channels=0):
|
38 |
+
super().__init__()
|
39 |
+
|
40 |
+
self.in_channels = in_channels
|
41 |
+
self.filter_channels = filter_channels
|
42 |
+
self.kernel_size = kernel_size
|
43 |
+
self.p_dropout = p_dropout
|
44 |
+
self.spk_channels = spk_channels
|
45 |
+
|
46 |
+
self.drop = nn.Dropout(p_dropout)
|
47 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
48 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
49 |
+
self.conv_2 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
50 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
51 |
+
self.conv_3 = nn.Conv1d(filter_channels, filter_channels, kernel_size, padding=kernel_size // 2)
|
52 |
+
self.norm_3 = modules.LayerNorm(filter_channels)
|
53 |
+
self.proj = nn.Conv1d(filter_channels, 2, 1)
|
54 |
+
|
55 |
+
if n_speakers != 0:
|
56 |
+
self.cond = nn.Conv1d(spk_channels, in_channels, 1)
|
57 |
+
|
58 |
+
def forward(self, x, x_mask, spk_emb=None):
|
59 |
+
# x = torch.detach(x)
|
60 |
+
if spk_emb is not None:
|
61 |
+
spk_emb = torch.detach(spk_emb)
|
62 |
+
x = x + self.cond(spk_emb)
|
63 |
+
|
64 |
+
x = self.conv_1(x * x_mask)
|
65 |
+
x = torch.relu(x)
|
66 |
+
x = self.norm_1(x)
|
67 |
+
x = self.drop(x)
|
68 |
+
|
69 |
+
x = self.conv_2(x * x_mask)
|
70 |
+
x = torch.relu(x)
|
71 |
+
x = self.norm_2(x)
|
72 |
+
x = self.drop(x)
|
73 |
+
|
74 |
+
x = self.conv_3(x * x_mask)
|
75 |
+
x = torch.relu(x)
|
76 |
+
x = self.norm_3(x)
|
77 |
+
x = self.drop(x)
|
78 |
+
|
79 |
+
x = self.proj(x * x_mask)
|
80 |
+
return x * x_mask
|
81 |
+
|
82 |
+
|
83 |
+
class TextEncoder(nn.Module):
|
84 |
+
def __init__(self,
|
85 |
+
n_vocab,
|
86 |
+
out_channels,
|
87 |
+
hidden_channels,
|
88 |
+
filter_channels,
|
89 |
+
n_heads,
|
90 |
+
n_layers,
|
91 |
+
kernel_size,
|
92 |
+
p_dropout):
|
93 |
+
super().__init__()
|
94 |
+
self.n_vocab = n_vocab
|
95 |
+
self.out_channels = out_channels
|
96 |
+
self.hidden_channels = hidden_channels
|
97 |
+
self.filter_channels = filter_channels
|
98 |
+
self.n_heads = n_heads
|
99 |
+
self.n_layers = n_layers
|
100 |
+
self.kernel_size = kernel_size
|
101 |
+
self.p_dropout = p_dropout
|
102 |
+
|
103 |
+
self.emb_phone = nn.Embedding(len(ttsing_phone_set), 256)
|
104 |
+
nn.init.normal_(self.emb_phone.weight, 0.0, 256 ** -0.5)
|
105 |
+
|
106 |
+
self.pre_net = torch.nn.Linear(256, hidden_channels)
|
107 |
+
|
108 |
+
self.encoder = attentions.Encoder(
|
109 |
+
hidden_channels,
|
110 |
+
filter_channels,
|
111 |
+
n_heads,
|
112 |
+
n_layers,
|
113 |
+
kernel_size,
|
114 |
+
p_dropout)
|
115 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
116 |
+
|
117 |
+
def forward(self, phone, phone_lengths, pitchid, dur, slur):
|
118 |
+
phone_end = self.emb_phone(phone) * math.sqrt(256)
|
119 |
+
x = phone_end
|
120 |
+
|
121 |
+
x = self.pre_net(x)
|
122 |
+
x = torch.transpose(x, 1, -1) # [b, h, t]
|
123 |
+
|
124 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(phone_lengths, x.size(2)), 1).to(x.dtype)
|
125 |
+
|
126 |
+
x = self.encoder(x * x_mask, x_mask)
|
127 |
+
x = self.proj(x) * x_mask
|
128 |
+
|
129 |
+
return x, x_mask
|
130 |
+
|
131 |
+
|
132 |
+
def pad_v2(input_ele, mel_max_length=None):
|
133 |
+
if mel_max_length:
|
134 |
+
max_len = mel_max_length
|
135 |
+
else:
|
136 |
+
max_len = max([input_ele[i].size(0) for i in range(len(input_ele))])
|
137 |
+
|
138 |
+
out_list = list()
|
139 |
+
for i, batch in enumerate(input_ele):
|
140 |
+
if len(batch.shape) == 1:
|
141 |
+
one_batch_padded = F.pad(
|
142 |
+
batch, (0, max_len - batch.size(0)), "constant", 0.0
|
143 |
+
)
|
144 |
+
elif len(batch.shape) == 2:
|
145 |
+
one_batch_padded = F.pad(
|
146 |
+
batch, (0, 0, 0, max_len - batch.size(0)), "constant", 0.0
|
147 |
+
)
|
148 |
+
out_list.append(one_batch_padded)
|
149 |
+
out_padded = torch.stack(out_list)
|
150 |
+
return out_padded
|
151 |
+
|
152 |
+
|
153 |
+
class LengthRegulator(nn.Module):
|
154 |
+
""" Length Regulator """
|
155 |
+
|
156 |
+
def __init__(self):
|
157 |
+
super(LengthRegulator, self).__init__()
|
158 |
+
|
159 |
+
def LR(self, x, duration, max_len):
|
160 |
+
x = torch.transpose(x, 1, 2)
|
161 |
+
output = list()
|
162 |
+
mel_len = list()
|
163 |
+
for batch, expand_target in zip(x, duration):
|
164 |
+
expanded = self.expand(batch, expand_target)
|
165 |
+
output.append(expanded)
|
166 |
+
mel_len.append(expanded.shape[0])
|
167 |
+
|
168 |
+
if max_len is not None:
|
169 |
+
output = pad_v2(output, max_len)
|
170 |
+
else:
|
171 |
+
output = pad_v2(output)
|
172 |
+
output = torch.transpose(output, 1, 2)
|
173 |
+
return output, torch.LongTensor(mel_len)
|
174 |
+
|
175 |
+
def expand(self, batch, predicted):
|
176 |
+
predicted = torch.squeeze(predicted)
|
177 |
+
out = list()
|
178 |
+
|
179 |
+
for i, vec in enumerate(batch):
|
180 |
+
expand_size = predicted[i].item()
|
181 |
+
state_info_index = torch.unsqueeze(torch.arange(0, expand_size), 1).float()
|
182 |
+
state_info_length = torch.unsqueeze(torch.Tensor([expand_size] * expand_size), 1).float()
|
183 |
+
state_info = torch.cat([state_info_index, state_info_length], 1).to(vec.device)
|
184 |
+
new_vec = vec.expand(max(int(expand_size), 0), -1)
|
185 |
+
new_vec = torch.cat([new_vec, state_info], 1)
|
186 |
+
out.append(new_vec)
|
187 |
+
out = torch.cat(out, 0)
|
188 |
+
return out
|
189 |
+
|
190 |
+
def forward(self, x, duration, max_len):
|
191 |
+
output, mel_len = self.LR(x, duration, max_len)
|
192 |
+
return output, mel_len
|
193 |
+
|
194 |
+
|
195 |
+
class PriorDecoder(nn.Module):
|
196 |
+
def __init__(self,
|
197 |
+
out_bn_channels,
|
198 |
+
hidden_channels,
|
199 |
+
filter_channels,
|
200 |
+
n_heads,
|
201 |
+
n_layers,
|
202 |
+
kernel_size,
|
203 |
+
p_dropout,
|
204 |
+
n_speakers=0,
|
205 |
+
spk_channels=0):
|
206 |
+
super().__init__()
|
207 |
+
self.out_bn_channels = out_bn_channels
|
208 |
+
self.hidden_channels = hidden_channels
|
209 |
+
self.filter_channels = filter_channels
|
210 |
+
self.n_heads = n_heads
|
211 |
+
self.n_layers = n_layers
|
212 |
+
self.kernel_size = kernel_size
|
213 |
+
self.p_dropout = p_dropout
|
214 |
+
self.spk_channels = spk_channels
|
215 |
+
|
216 |
+
self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1)
|
217 |
+
self.decoder = attentions.FFT(
|
218 |
+
hidden_channels,
|
219 |
+
filter_channels,
|
220 |
+
n_heads,
|
221 |
+
n_layers,
|
222 |
+
kernel_size,
|
223 |
+
p_dropout)
|
224 |
+
self.proj = nn.Conv1d(hidden_channels, out_bn_channels, 1)
|
225 |
+
|
226 |
+
if n_speakers != 0:
|
227 |
+
self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
|
228 |
+
|
229 |
+
def forward(self, x, x_lengths, spk_emb=None):
|
230 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
231 |
+
|
232 |
+
x = self.prenet(x) * x_mask
|
233 |
+
|
234 |
+
if (spk_emb is not None):
|
235 |
+
x = x + self.cond(spk_emb)
|
236 |
+
|
237 |
+
x = self.decoder(x * x_mask, x_mask)
|
238 |
+
|
239 |
+
bn = self.proj(x) * x_mask
|
240 |
+
|
241 |
+
return bn, x_mask
|
242 |
+
|
243 |
+
|
244 |
+
class Decoder(nn.Module):
|
245 |
+
def __init__(self,
|
246 |
+
out_channels,
|
247 |
+
hidden_channels,
|
248 |
+
filter_channels,
|
249 |
+
n_heads,
|
250 |
+
n_layers,
|
251 |
+
kernel_size,
|
252 |
+
p_dropout,
|
253 |
+
n_speakers=0,
|
254 |
+
spk_channels=0):
|
255 |
+
super().__init__()
|
256 |
+
self.out_channels = out_channels
|
257 |
+
self.hidden_channels = hidden_channels
|
258 |
+
self.filter_channels = filter_channels
|
259 |
+
self.n_heads = n_heads
|
260 |
+
self.n_layers = n_layers
|
261 |
+
self.kernel_size = kernel_size
|
262 |
+
self.p_dropout = p_dropout
|
263 |
+
self.spk_channels = spk_channels
|
264 |
+
|
265 |
+
self.prenet = nn.Conv1d(hidden_channels + 2, hidden_channels, 3, padding=1)
|
266 |
+
self.decoder = attentions.FFT(
|
267 |
+
hidden_channels,
|
268 |
+
filter_channels,
|
269 |
+
n_heads,
|
270 |
+
n_layers,
|
271 |
+
kernel_size,
|
272 |
+
p_dropout)
|
273 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
274 |
+
|
275 |
+
if n_speakers != 0:
|
276 |
+
self.cond = nn.Conv1d(spk_channels, hidden_channels, 1)
|
277 |
+
|
278 |
+
def forward(self, x, x_lengths, spk_emb=None):
|
279 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
280 |
+
|
281 |
+
x = self.prenet(x) * x_mask
|
282 |
+
|
283 |
+
if (spk_emb is not None):
|
284 |
+
x = x + self.cond(spk_emb)
|
285 |
+
|
286 |
+
x = self.decoder(x * x_mask, x_mask)
|
287 |
+
|
288 |
+
x = self.proj(x) * x_mask
|
289 |
+
|
290 |
+
return x, x_mask
|
291 |
+
|
292 |
+
|
293 |
+
class ConvReluNorm(nn.Module):
|
294 |
+
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
|
295 |
+
super().__init__()
|
296 |
+
self.in_channels = in_channels
|
297 |
+
self.hidden_channels = hidden_channels
|
298 |
+
self.out_channels = out_channels
|
299 |
+
self.kernel_size = kernel_size
|
300 |
+
self.n_layers = n_layers
|
301 |
+
self.p_dropout = p_dropout
|
302 |
+
assert n_layers > 1, "Number of layers should be larger than 0."
|
303 |
+
|
304 |
+
self.conv_layers = nn.ModuleList()
|
305 |
+
self.norm_layers = nn.ModuleList()
|
306 |
+
self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
|
307 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
308 |
+
self.relu_drop = nn.Sequential(
|
309 |
+
nn.ReLU(),
|
310 |
+
nn.Dropout(p_dropout))
|
311 |
+
for _ in range(n_layers - 1):
|
312 |
+
self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size // 2))
|
313 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
314 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
315 |
+
self.proj.weight.data.zero_()
|
316 |
+
self.proj.bias.data.zero_()
|
317 |
+
|
318 |
+
def forward(self, x):
|
319 |
+
x = self.conv_layers[0](x)
|
320 |
+
x = self.norm_layers[0](x)
|
321 |
+
x = self.relu_drop(x)
|
322 |
+
|
323 |
+
for i in range(1, self.n_layers):
|
324 |
+
x_ = self.conv_layers[i](x)
|
325 |
+
x_ = self.norm_layers[i](x_)
|
326 |
+
x_ = self.relu_drop(x_)
|
327 |
+
x = (x + x_) / 2
|
328 |
+
x = self.proj(x)
|
329 |
+
return x
|
330 |
+
|
331 |
+
|
332 |
+
class PosteriorEncoder(nn.Module):
|
333 |
+
def __init__(self,
|
334 |
+
hps,
|
335 |
+
in_channels,
|
336 |
+
out_channels,
|
337 |
+
hidden_channels,
|
338 |
+
kernel_size,
|
339 |
+
dilation_rate,
|
340 |
+
n_layers):
|
341 |
+
super().__init__()
|
342 |
+
self.in_channels = in_channels
|
343 |
+
self.out_channels = out_channels
|
344 |
+
self.hidden_channels = hidden_channels
|
345 |
+
self.kernel_size = kernel_size
|
346 |
+
self.dilation_rate = dilation_rate
|
347 |
+
self.n_layers = n_layers
|
348 |
+
|
349 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
350 |
+
self.enc = modules.WN(hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=hps.data.n_speakers, spk_channels=hps.model.spk_channels)
|
351 |
+
# self.enc = ConvReluNorm(hidden_channels,
|
352 |
+
# hidden_channels,
|
353 |
+
# hidden_channels,
|
354 |
+
# kernel_size,
|
355 |
+
# n_layers,
|
356 |
+
# 0.1)
|
357 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
358 |
+
|
359 |
+
def forward(self, x, x_lengths, g=None):
|
360 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
361 |
+
x = self.pre(x) * x_mask
|
362 |
+
x = self.enc(x, x_mask, g=g)
|
363 |
+
stats = self.proj(x) * x_mask
|
364 |
+
return stats, x_mask
|
365 |
+
|
366 |
+
|
367 |
+
class ResBlock3(torch.nn.Module):
|
368 |
+
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
|
369 |
+
super(ResBlock3, self).__init__()
|
370 |
+
self.convs = nn.ModuleList([
|
371 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
372 |
+
padding=get_padding(kernel_size, dilation[0])))
|
373 |
+
])
|
374 |
+
self.convs.apply(init_weights)
|
375 |
+
|
376 |
+
def forward(self, x, x_mask=None):
|
377 |
+
for c in self.convs:
|
378 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
379 |
+
if x_mask is not None:
|
380 |
+
xt = xt * x_mask
|
381 |
+
xt = c(xt)
|
382 |
+
x = xt + x
|
383 |
+
if x_mask is not None:
|
384 |
+
x = x * x_mask
|
385 |
+
return x
|
386 |
+
|
387 |
+
def remove_weight_norm(self):
|
388 |
+
for l in self.convs:
|
389 |
+
remove_weight_norm(l)
|
390 |
+
|
391 |
+
|
392 |
+
class Generator_Harm(torch.nn.Module):
|
393 |
+
def __init__(self, hps):
|
394 |
+
super(Generator_Harm, self).__init__()
|
395 |
+
self.hps = hps
|
396 |
+
|
397 |
+
self.prenet = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1)
|
398 |
+
|
399 |
+
self.net = ConvReluNorm(hps.model.hidden_channels,
|
400 |
+
hps.model.hidden_channels,
|
401 |
+
hps.model.hidden_channels,
|
402 |
+
hps.model.kernel_size,
|
403 |
+
8,
|
404 |
+
hps.model.p_dropout)
|
405 |
+
|
406 |
+
# self.rnn = nn.LSTM(input_size=hps.model.hidden_channels,
|
407 |
+
# hidden_size=hps.model.hidden_channels,
|
408 |
+
# num_layers=1,
|
409 |
+
# bias=True,
|
410 |
+
# batch_first=True,
|
411 |
+
# dropout=0.5,
|
412 |
+
# bidirectional=True)
|
413 |
+
self.postnet = Conv1d(hps.model.hidden_channels, hps.model.n_harmonic + 1, 3, padding=1)
|
414 |
+
|
415 |
+
def forward(self, f0, harm, mask):
|
416 |
+
pitch = f0.transpose(1, 2)
|
417 |
+
harm = self.prenet(harm)
|
418 |
+
|
419 |
+
harm = self.net(harm) * mask
|
420 |
+
# harm = harm.transpose(1, 2)
|
421 |
+
# harm, (hs, hc) = self.rnn(harm)
|
422 |
+
# harm = harm.transpose(1, 2)
|
423 |
+
|
424 |
+
harm = self.postnet(harm)
|
425 |
+
harm = harm.transpose(1, 2)
|
426 |
+
param = harm
|
427 |
+
|
428 |
+
param = scale_function(param)
|
429 |
+
total_amp = param[..., :1]
|
430 |
+
amplitudes = param[..., 1:]
|
431 |
+
amplitudes = remove_above_nyquist(
|
432 |
+
amplitudes,
|
433 |
+
pitch,
|
434 |
+
self.hps.data.sample_rate,
|
435 |
+
)
|
436 |
+
amplitudes /= amplitudes.sum(-1, keepdim=True)
|
437 |
+
amplitudes *= total_amp
|
438 |
+
|
439 |
+
amplitudes = upsample(amplitudes, self.hps.data.hop_size)
|
440 |
+
pitch = upsample(pitch, self.hps.data.hop_size)
|
441 |
+
|
442 |
+
n_harmonic = amplitudes.shape[-1]
|
443 |
+
omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1)
|
444 |
+
omegas = omega * torch.arange(1, n_harmonic + 1).to(omega)
|
445 |
+
signal_harmonics = (torch.sin(omegas) * amplitudes)
|
446 |
+
signal_harmonics = signal_harmonics.transpose(1, 2)
|
447 |
+
return signal_harmonics
|
448 |
+
|
449 |
+
|
450 |
+
class Generator(torch.nn.Module):
|
451 |
+
def __init__(self, hps, initial_channel, resblock, resblock_kernel_sizes, resblock_dilation_sizes, upsample_rates,
|
452 |
+
upsample_initial_channel, upsample_kernel_sizes, n_speakers=0, spk_channels=0):
|
453 |
+
super(Generator, self).__init__()
|
454 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
455 |
+
self.num_upsamples = len(upsample_rates)
|
456 |
+
self.conv_pre = Conv1d(initial_channel, upsample_initial_channel, 7, 1, padding=3)
|
457 |
+
self.upsample_rates = upsample_rates
|
458 |
+
self.n_speakers = n_speakers
|
459 |
+
|
460 |
+
resblock = modules.ResBlock1 if resblock == '1' else modules.R
|
461 |
+
|
462 |
+
self.downs = nn.ModuleList()
|
463 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
464 |
+
i = len(upsample_rates) - 1 - i
|
465 |
+
u = upsample_rates[i]
|
466 |
+
k = upsample_kernel_sizes[i]
|
467 |
+
# print("down: ",upsample_initial_channel//(2**(i+1))," -> ", upsample_initial_channel//(2**i))
|
468 |
+
self.downs.append(weight_norm(
|
469 |
+
Conv1d(hps.model.n_harmonic + 2, hps.model.n_harmonic + 2,
|
470 |
+
k, u, padding=k // 2)))
|
471 |
+
|
472 |
+
self.resblocks_downs = nn.ModuleList()
|
473 |
+
for i in range(len(self.downs)):
|
474 |
+
j = len(upsample_rates) - 1 - i
|
475 |
+
self.resblocks_downs.append(ResBlock3(hps.model.n_harmonic + 2, 3, (1, 3)))
|
476 |
+
|
477 |
+
self.concat_pre = Conv1d(upsample_initial_channel + hps.model.n_harmonic + 2, upsample_initial_channel, 3, 1,
|
478 |
+
padding=1)
|
479 |
+
self.concat_conv = nn.ModuleList()
|
480 |
+
for i in range(len(upsample_rates)):
|
481 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
482 |
+
self.concat_conv.append(Conv1d(ch + hps.model.n_harmonic + 2, ch, 3, 1, padding=1, bias=False))
|
483 |
+
|
484 |
+
self.ups = nn.ModuleList()
|
485 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
486 |
+
self.ups.append(weight_norm(
|
487 |
+
ConvTranspose1d(upsample_initial_channel // (2 ** i), upsample_initial_channel // (2 ** (i + 1)),
|
488 |
+
k, u, padding=(k - u) // 2)))
|
489 |
+
|
490 |
+
self.resblocks = nn.ModuleList()
|
491 |
+
for i in range(len(self.ups)):
|
492 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
493 |
+
for j, (k, d) in enumerate(zip(resblock_kernel_sizes, resblock_dilation_sizes)):
|
494 |
+
self.resblocks.append(resblock(ch, k, d))
|
495 |
+
|
496 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
497 |
+
self.ups.apply(init_weights)
|
498 |
+
|
499 |
+
if self.n_speakers != 0:
|
500 |
+
self.cond = nn.Conv1d(spk_channels, upsample_initial_channel, 1)
|
501 |
+
|
502 |
+
def forward(self, x, ddsp, g=None):
|
503 |
+
|
504 |
+
x = self.conv_pre(x)
|
505 |
+
|
506 |
+
if g is not None:
|
507 |
+
x = x + self.cond(g)
|
508 |
+
|
509 |
+
se = ddsp
|
510 |
+
res_features = [se]
|
511 |
+
for i in range(self.num_upsamples):
|
512 |
+
in_size = se.size(2)
|
513 |
+
se = self.downs[i](se)
|
514 |
+
se = self.resblocks_downs[i](se)
|
515 |
+
up_rate = self.upsample_rates[self.num_upsamples - 1 - i]
|
516 |
+
se = se[:, :, : in_size // up_rate]
|
517 |
+
res_features.append(se)
|
518 |
+
|
519 |
+
x = torch.cat([x, se], 1)
|
520 |
+
x = self.concat_pre(x)
|
521 |
+
|
522 |
+
for i in range(self.num_upsamples):
|
523 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
524 |
+
in_size = x.size(2)
|
525 |
+
x = self.ups[i](x)
|
526 |
+
# 保证维度正确,丢掉多余通道
|
527 |
+
x = x[:, :, : in_size * self.upsample_rates[i]]
|
528 |
+
|
529 |
+
x = torch.cat([x, res_features[self.num_upsamples - 1 - i]], 1)
|
530 |
+
x = self.concat_conv[i](x)
|
531 |
+
|
532 |
+
xs = None
|
533 |
+
for j in range(self.num_kernels):
|
534 |
+
if xs is None:
|
535 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
536 |
+
else:
|
537 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
538 |
+
x = xs / self.num_kernels
|
539 |
+
|
540 |
+
x = F.leaky_relu(x)
|
541 |
+
x = self.conv_post(x)
|
542 |
+
x = torch.tanh(x)
|
543 |
+
|
544 |
+
return x
|
545 |
+
|
546 |
+
def remove_weight_norm(self):
|
547 |
+
print('Removing weight norm...')
|
548 |
+
for l in self.ups:
|
549 |
+
remove_weight_norm(l)
|
550 |
+
for l in self.resblocks:
|
551 |
+
l.remove_weight_norm()
|
552 |
+
|
553 |
+
|
554 |
+
class Generator_Noise(torch.nn.Module):
    def __init__(self, hps):
        super(Generator_Noise, self).__init__()
        self.hps = hps
        self.win_size = hps.data.win_size
        self.hop_size = hps.data.hop_size
        self.fft_size = hps.data.n_fft
        self.istft_pre = Conv1d(hps.model.hidden_channels, hps.model.hidden_channels, 3, padding=1)

        self.net = ConvReluNorm(hps.model.hidden_channels,
                                hps.model.hidden_channels,
                                hps.model.hidden_channels,
                                hps.model.kernel_size,
                                8,
                                hps.model.p_dropout)

        self.istft_amplitude = torch.nn.Conv1d(hps.model.hidden_channels, self.fft_size // 2 + 1, 1, 1)
        self.window = torch.hann_window(self.win_size)

    def forward(self, x, mask):
        istft_x = x
        istft_x = self.istft_pre(istft_x)

        istft_x = self.net(istft_x) * mask

        amp = self.istft_amplitude(istft_x).unsqueeze(-1)
        phase = (torch.rand(amp.shape) * 2 * 3.14 - 3.14).to(amp)

        real = amp * torch.cos(phase)
        imag = amp * torch.sin(phase)
        spec = torch.cat([real, imag], 3)
        istft_x = torch.istft(spec, self.fft_size, self.hop_size, self.win_size, self.window.to(amp), True,
                              length=x.shape[2] * self.hop_size, return_complex=False)

        return istft_x.unsqueeze(1)

class LayerNorm(nn.Module):
    def __init__(self, channels, eps=1e-5):
        super().__init__()
        self.channels = channels
        self.eps = eps

        self.gamma = nn.Parameter(torch.ones(channels))
        self.beta = nn.Parameter(torch.zeros(channels))

    def forward(self, x):
        x = x.transpose(1, -1)
        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
        return x.transpose(1, -1)

class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        self.use_spectral_norm = use_spectral_norm
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(kernel_size, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(get_padding(kernel_size, 1), 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap

class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 16, 15, 1, padding=7)),
            norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
            norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, modules.LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap

class MultiFrequencyDiscriminator(nn.Module):
    def __init__(self,
                 hop_lengths=[128, 256, 512],
                 hidden_channels=[256, 512, 512],
                 domain='double', mel_scale=True):
        super(MultiFrequencyDiscriminator, self).__init__()

        self.stfts = nn.ModuleList([
            TorchSTFT(fft_size=x * 4, hop_size=x, win_size=x * 4,
                      normalized=True, domain=domain, mel_scale=mel_scale)
            for x in hop_lengths])

        self.domain = domain
        if domain == 'double':
            self.discriminators = nn.ModuleList([
                BaseFrequenceDiscriminator(2, c)
                for x, c in zip(hop_lengths, hidden_channels)])
        else:
            self.discriminators = nn.ModuleList([
                BaseFrequenceDiscriminator(1, c)
                for x, c in zip(hop_lengths, hidden_channels)])

    def forward(self, x):
        scores, feats = list(), list()
        for stft, layer in zip(self.stfts, self.discriminators):
            # print(stft)
            mag, phase = stft.transform(x.squeeze())
            if self.domain == 'double':
                mag = torch.stack(torch.chunk(mag, 2, dim=1), dim=1)
            else:
                mag = mag.unsqueeze(1)

            score, feat = layer(mag)
            scores.append(score)
            feats.append(feat)
        return scores, feats

class BaseFrequenceDiscriminator(nn.Module):
    def __init__(self, in_channels, hidden_channels=512):
        super(BaseFrequenceDiscriminator, self).__init__()

        self.discriminator = nn.ModuleList()
        self.discriminator += [
            nn.Sequential(
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    in_channels, hidden_channels // 32,
                    kernel_size=(3, 3), stride=(1, 1)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels // 32, hidden_channels // 16,
                    kernel_size=(3, 3), stride=(2, 2)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels // 16, hidden_channels // 8,
                    kernel_size=(3, 3), stride=(1, 1)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels // 8, hidden_channels // 4,
                    kernel_size=(3, 3), stride=(2, 2)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels // 4, hidden_channels // 2,
                    kernel_size=(3, 3), stride=(1, 1)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels // 2, hidden_channels,
                    kernel_size=(3, 3), stride=(2, 2)))
            ),
            nn.Sequential(
                nn.LeakyReLU(0.2, True),
                nn.ReflectionPad2d((1, 1, 1, 1)),
                nn.utils.weight_norm(nn.Conv2d(
                    hidden_channels, 1,
                    kernel_size=(3, 3), stride=(1, 1)))
            )
        ]

    def forward(self, x):
        hiddens = []
        for layer in self.discriminator:
            x = layer(x)
            hiddens.append(x)
        return x, hiddens[-1]

class Discriminator(torch.nn.Module):
    def __init__(self, hps, use_spectral_norm=False):
        super(Discriminator, self).__init__()
        periods = [2, 3, 5, 7, 11]

        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
        discs = discs + [DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods]
        self.discriminators = nn.ModuleList(discs)
        self.disc_multfrequency = MultiFrequencyDiscriminator(hop_lengths=[int(hps.data.sample_rate * 2.5 / 1000),
                                                                           int(hps.data.sample_rate * 5 / 1000),
                                                                           int(hps.data.sample_rate * 7.5 / 1000),
                                                                           int(hps.data.sample_rate * 10 / 1000),
                                                                           int(hps.data.sample_rate * 12.5 / 1000),
                                                                           int(hps.data.sample_rate * 15 / 1000)],
                                                              hidden_channels=[256, 256, 256, 256, 256])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            y_d_gs.append(y_d_g)
            fmap_rs.append(fmap_r)
            fmap_gs.append(fmap_g)
        scores_r, fmaps_r = self.disc_multfrequency(y)
        scores_g, fmaps_g = self.disc_multfrequency(y_hat)
        for i in range(len(scores_r)):
            y_d_rs.append(scores_r[i])
            y_d_gs.append(scores_g[i])
            fmap_rs.append(fmaps_r[i])
            fmap_gs.append(fmaps_g[i])
        return y_d_rs, y_d_gs, fmap_rs, fmap_gs

class SynthesizerTrn(nn.Module):
    """
    Model
    """

    def __init__(self, hps):
        super().__init__()
        self.hps = hps

        self.text_encoder = TextEncoder(
            len(ttsing_phone_set),
            hps.model.prior_hidden_channels,
            hps.model.prior_hidden_channels,
            hps.model.prior_filter_channels,
            hps.model.prior_n_heads,
            hps.model.prior_n_layers,
            hps.model.prior_kernel_size,
            hps.model.prior_p_dropout)

        self.decoder = PriorDecoder(
            hps.model.hidden_channels * 2,
            hps.model.prior_hidden_channels,
            hps.model.prior_filter_channels,
            hps.model.prior_n_heads,
            hps.model.prior_n_layers,
            hps.model.prior_kernel_size,
            hps.model.prior_p_dropout,
            n_speakers=hps.data.n_speakers,
            spk_channels=hps.model.spk_channels
        )

        self.f0_decoder = Decoder(
            1,
            hps.model.prior_hidden_channels,
            hps.model.prior_filter_channels,
            hps.model.prior_n_heads,
            hps.model.prior_n_layers,
            hps.model.prior_kernel_size,
            hps.model.prior_p_dropout,
            n_speakers=hps.data.n_speakers,
            spk_channels=hps.model.spk_channels
        )

        self.mel_decoder = Decoder(
            hps.data.acoustic_dim,
            hps.model.prior_hidden_channels,
            hps.model.prior_filter_channels,
            hps.model.prior_n_heads,
            hps.model.prior_n_layers,
            hps.model.prior_kernel_size,
            hps.model.prior_p_dropout,
            n_speakers=hps.data.n_speakers,
            spk_channels=hps.model.spk_channels
        )

        self.posterior_encoder = PosteriorEncoder(
            hps,
            hps.data.acoustic_dim,
            hps.model.hidden_channels,
            hps.model.hidden_channels, 3, 1, 8)

        self.dropout = nn.Dropout(0.2)

        self.duration_predictor = DurationPredictor(
            hps.model.prior_hidden_channels,
            hps.model.prior_hidden_channels,
            3,
            0.5,
            n_speakers=hps.data.n_speakers,
            spk_channels=hps.model.spk_channels)
        self.LR = LengthRegulator()

        self.dec = Generator(hps,
                             hps.model.hidden_channels,
                             hps.model.resblock,
                             hps.model.resblock_kernel_sizes,
                             hps.model.resblock_dilation_sizes,
                             hps.model.upsample_rates,
                             hps.model.upsample_initial_channel,
                             hps.model.upsample_kernel_sizes,
                             n_speakers=hps.data.n_speakers,
                             spk_channels=hps.model.spk_channels)

        self.dec_harm = Generator_Harm(hps)

        self.dec_noise = Generator_Noise(hps)

        self.f0_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1)
        self.energy_prenet = nn.Conv1d(1, hps.model.prior_hidden_channels + 2, 3, padding=1)
        self.mel_prenet = nn.Conv1d(hps.data.acoustic_dim, hps.model.prior_hidden_channels + 2, 3, padding=1)

        if hps.data.n_speakers > 1:
            self.emb_spk = nn.Embedding(hps.data.n_speakers, hps.model.spk_channels)
        self.flow = modules.ResidualCouplingBlock(hps.model.prior_hidden_channels, hps.model.hidden_channels, 5, 1, 4,
                                                  n_speakers=hps.data.n_speakers, gin_channels=hps.model.spk_channels)

    def forward(self, phone, phone_lengths, pitchid, dur, slur, gtdur, F0, mel, bn_lengths, spk_id=None):
        if self.hps.data.n_speakers > 0:
            g = self.emb_spk(spk_id).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        # Encoder
        x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur)

        # LR
        decoder_input, mel_len = self.LR(x, gtdur, None)

        LF0 = 2595. * torch.log10(1. + F0 / 700.)
        LF0 = LF0 / 500

        # aam
        predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), bn_lengths, spk_emb=g)

        predict_energy = predict_mel.detach().sum(1).unsqueeze(1) / self.hps.data.acoustic_dim

        decoder_input = decoder_input + \
                        self.f0_prenet(LF0) + \
                        self.energy_prenet(predict_energy) + \
                        self.mel_prenet(predict_mel.detach())
        decoder_output, predict_bn_mask = self.decoder(decoder_input, bn_lengths, spk_emb=g)

        prior_info = decoder_output
        m_p = prior_info[:, :self.hps.model.hidden_channels, :]
        logs_p = prior_info[:, self.hps.model.hidden_channels:, :]

        # posterior
        posterior, y_mask = self.posterior_encoder(mel, bn_lengths, g=g)
        m_q = posterior[:, :self.hps.model.hidden_channels, :]
        logs_q = posterior[:, self.hps.model.hidden_channels:, :]
        z = (m_q + torch.randn_like(m_q) * torch.exp(logs_q)) * y_mask
        z_p = self.flow(z, y_mask, g=g)

        # kl loss
        loss_kl = kl_loss(z_p, logs_q, m_p, logs_p, y_mask)

        p_z = z
        p_z = self.dropout(p_z)

        pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size)
        omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1)
        sin = torch.sin(omega).transpose(1, 2)

        # dsp synthesize
        noise_x = self.dec_noise(p_z, y_mask)
        harm_x = self.dec_harm(F0, p_z, y_mask)

        # dsp waveform
        dsp_o = torch.cat([harm_x, noise_x], axis=1)

        decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1)

        # dsp based HiFiGAN vocoder
        x_slice, ids_slice = commons.rand_slice_segments(p_z, bn_lengths,
                                                         self.hps.train.segment_size // self.hps.data.hop_size)
        F0_slice = commons.slice_segments(F0, ids_slice, self.hps.train.segment_size // self.hps.data.hop_size)
        dsp_slice = commons.slice_segments(dsp_o, ids_slice * self.hps.data.hop_size, self.hps.train.segment_size)
        condition_slice = commons.slice_segments(decoder_condition, ids_slice * self.hps.data.hop_size,
                                                 self.hps.train.segment_size)
        o = self.dec(x_slice, condition_slice.detach(), g=g)

        return o, ids_slice, LF0 * predict_bn_mask, dsp_slice.sum(1), loss_kl, predict_mel, predict_bn_mask

    def infer(self, phone, phone_lengths, pitchid, dur, slur, gtdur=None, spk_id=None, length_scale=1., F0=None, noise_scale=0.8):

        if self.hps.data.n_speakers > 0:
            g = self.emb_spk(spk_id).unsqueeze(-1)  # [b, h, 1]
        else:
            g = None

        # Encoder
        x, x_mask = self.text_encoder(phone, phone_lengths, pitchid, dur, slur)

        # dur
        y_lengths = torch.clamp_min(torch.sum(gtdur.squeeze(1), [1]), 1).long()
        LF0 = 2595. * torch.log10(1. + F0 / 700.)
        LF0 = LF0 / 500
        # LR
        decoder_input, mel_len = self.LR(x, gtdur, None)

        # aam
        predict_mel, predict_bn_mask = self.mel_decoder(decoder_input + self.f0_prenet(LF0), y_lengths, spk_emb=g)

        predict_energy = predict_mel.sum(1).unsqueeze(1) / self.hps.data.acoustic_dim

        decoder_input = decoder_input + \
                        self.f0_prenet(LF0) + \
                        self.energy_prenet(predict_energy) + \
                        self.mel_prenet(predict_mel)
        decoder_output, y_mask = self.decoder(decoder_input, y_lengths, spk_emb=g)

        prior_info = decoder_output

        m_p = prior_info[:, :self.hps.model.hidden_channels, :]
        logs_p = prior_info[:, self.hps.model.hidden_channels:, :]
        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
        z = self.flow(z_p, y_mask, g=g, reverse=True)

        prior_z = z

        noise_x = self.dec_noise(prior_z, y_mask)

        harm_x = self.dec_harm(F0, prior_z, y_mask)

        pitch = upsample(F0.transpose(1, 2), self.hps.data.hop_size)
        omega = torch.cumsum(2 * math.pi * pitch / self.hps.data.sample_rate, 1)
        sin = torch.sin(omega).transpose(1, 2)

        decoder_condition = torch.cat([harm_x, noise_x, sin], axis=1)

        # dsp based HiFiGAN vocoder
        o = self.dec(prior_z, decoder_condition, g=g)

        return o, harm_x.sum(1).unsqueeze(1), noise_x
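For reference, both SynthesizerTrn.forward and SynthesizerTrn.infer above feed F0 to the prenets on a compressed mel-like scale rather than in raw Hertz. A minimal standalone sketch of that normalisation (the function name is illustrative only, not part of the repository):

import torch

def lf0_normalize(F0: torch.Tensor) -> torch.Tensor:
    # same compression as in SynthesizerTrn.forward/infer:
    # 2595 * log10(1 + f / 700), then divided by 500 so the values stay small
    return 2595. * torch.log10(1. + F0 / 700.) / 500

print(lf0_normalize(torch.tensor([440.0])))  # A4 (440 Hz) maps to roughly 1.10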
egs/visinger2/train.py
ADDED
@@ -0,0 +1,456 @@
import os
import sys
import json
import argparse
import itertools
import math
import time
import logging

import torch
from torch import nn, optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.cuda.amp import autocast, GradScaler

sys.path.append('../..')
import modules.commons as commons
import utils.utils as utils

from dataset import DatasetConstructor

from models import (
    SynthesizerTrn,
    Discriminator
)

from modules.losses import (
    generator_loss,
    discriminator_loss,
    feature_loss,
    kl_loss,
)
from preprocess.mel_processing import mel_spectrogram_torch, spec_to_mel_torch, spectrogram_torch

torch.backends.cudnn.benchmark = True
global_step = 0
use_cuda = torch.cuda.is_available()
print("use_cuda, ", use_cuda)

numba_logger = logging.getLogger('numba')
numba_logger.setLevel(logging.WARNING)


def main():
    """Assume Single Node Multi GPUs Training Only"""

    hps = utils.get_hparams()
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = str(hps.train.port)

    if (torch.cuda.is_available()):
        n_gpus = torch.cuda.device_count()
        mp.spawn(run, nprocs=n_gpus, args=(n_gpus, hps,))
    else:
        cpurun(0, 1, hps)


def run(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.train.save_dir)
        logger.info(hps.train)
        logger.info(hps.data)
        logger.info(hps.model)
        utils.check_git_hash(hps.train.save_dir)
        writer = SummaryWriter(log_dir=hps.train.save_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval"))

    dist.init_process_group(backend='nccl', init_method='env://', world_size=n_gpus, rank=rank)
    torch.manual_seed(hps.train.seed)
    torch.cuda.set_device(rank)
    dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank)

    train_loader = dataset_constructor.get_train_loader()
    if rank == 0:
        valid_loader = dataset_constructor.get_valid_loader()

    net_g = SynthesizerTrn(hps).cuda(rank)
    net_d = Discriminator(hps, hps.model.use_spectral_norm).cuda(rank)

    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    optim_d = torch.optim.AdamW(
        net_d.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    net_g = DDP(net_g, device_ids=[rank], find_unused_parameters=True)
    net_d = DDP(net_d, device_ids=[rank], find_unused_parameters=True)
    try:
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g,
                                                   optim_g)
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_d,
                                                   optim_d)
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        epoch_str = 1
        global_step = 0

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        if rank == 0:
            train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d],
                               [train_loader, valid_loader], logger, [writer, writer_eval])
        else:
            train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d],
                               [train_loader, None], None, None)
        scheduler_g.step()
        scheduler_d.step()


def cpurun(rank, n_gpus, hps):
    global global_step
    if rank == 0:
        logger = utils.get_logger(hps.train.save_dir)
        logger.info(hps.train)
        logger.info(hps.data)
        logger.info(hps.model)
        utils.check_git_hash(hps.train.save_dir)
        writer = SummaryWriter(log_dir=hps.train.save_dir)
        writer_eval = SummaryWriter(log_dir=os.path.join(hps.train.save_dir, "eval"))
    torch.manual_seed(hps.train.seed)
    dataset_constructor = DatasetConstructor(hps, num_replicas=n_gpus, rank=rank)

    train_loader = dataset_constructor.get_train_loader()
    if rank == 0:
        valid_loader = dataset_constructor.get_valid_loader()

    net_g = SynthesizerTrn(hps)
    net_d = Discriminator(hps, hps.model.use_spectral_norm)

    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    optim_d = torch.optim.AdamW(
        net_d.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps)
    try:
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "G_*.pth"), net_g,
                                                   optim_g)
        # load the discriminator checkpoint into the discriminator (not the generator)
        _, _, _, epoch_str = utils.load_checkpoint(utils.latest_checkpoint_path(hps.train.save_dir, "D_*.pth"), net_d,
                                                   optim_d)
        global_step = (epoch_str - 1) * len(train_loader)
    except:
        epoch_str = 1
        global_step = 0

    scheduler_g = torch.optim.lr_scheduler.ExponentialLR(optim_g, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)
    scheduler_d = torch.optim.lr_scheduler.ExponentialLR(optim_d, gamma=hps.train.lr_decay, last_epoch=epoch_str - 2)

    for epoch in range(epoch_str, hps.train.epochs + 1):
        train_and_evaluate(rank, epoch, hps, [net_g, net_d], [optim_g, optim_d], [scheduler_g, scheduler_d],
                           [train_loader, valid_loader], logger, [writer, writer_eval])

        scheduler_g.step()
        scheduler_d.step()


def train_and_evaluate(rank, epoch, hps, nets, optims, schedulers, loaders, logger, writers):
    net_g, net_d = nets
    optim_g, optim_d = optims
    scheduler_g, scheduler_d = schedulers
    train_loader, eval_loader = loaders
    if writers is not None:
        writer, writer_eval = writers

    train_loader.sampler.set_epoch(epoch)
    global global_step

    net_g.train()
    net_d.train()
    for batch_idx, data_dict in enumerate(train_loader):

        phone = data_dict["phone"]
        pitchid = data_dict["pitchid"]
        dur = data_dict["dur"]
        slur = data_dict["slur"]
        gtdur = data_dict["gtdur"]
        mel = data_dict["mel"]
        f0 = data_dict["f0"]
        wav = data_dict["wav"]
        spkid = data_dict["spkid"]

        phone_lengths = data_dict["phone_lengths"]
        mel_lengths = data_dict["mel_lengths"]
        wav_lengths = data_dict["wav_lengths"]
        f0_lengths = data_dict["f0_lengths"]

        # data
        if (use_cuda):
            phone, phone_lengths = phone.cuda(rank, non_blocking=True), phone_lengths.cuda(rank, non_blocking=True)
            pitchid = pitchid.cuda(rank, non_blocking=True)
            dur = dur.cuda(rank, non_blocking=True)
            slur = slur.cuda(rank, non_blocking=True)
            gtdur = gtdur.cuda(rank, non_blocking=True)
            mel, mel_lengths = mel.cuda(rank, non_blocking=True), mel_lengths.cuda(rank, non_blocking=True)
            wav, wav_lengths = wav.cuda(rank, non_blocking=True), wav_lengths.cuda(rank, non_blocking=True)
            f0, f0_lengths = f0.cuda(rank, non_blocking=True), f0_lengths.cuda(rank, non_blocking=True)
            spkid = spkid.cuda(rank, non_blocking=True)

        # forward
        y_hat, ids_slice, LF0, y_ddsp, kl_div, predict_mel, mask = net_g(phone, phone_lengths, pitchid, dur, slur,
                                                                         gtdur, f0, mel, mel_lengths, spk_id=spkid)
        y_ddsp = y_ddsp.unsqueeze(1)

        # Discriminator
        y = commons.slice_segments(wav, ids_slice * hps.data.hop_size, hps.train.segment_size)  # slice
        y_ddsp_mel = mel_spectrogram_torch(
            y_ddsp.squeeze(1),
            hps.data.n_fft,
            hps.data.acoustic_dim,
            hps.data.sample_rate,
            hps.data.hop_size,
            hps.data.win_size,
            hps.data.fmin,
            hps.data.fmax
        )

        y_logspec = torch.log(spectrogram_torch(
            y.squeeze(1),
            hps.data.n_fft,
            hps.data.sample_rate,
            hps.data.hop_size,
            hps.data.win_size
        ) + 1e-7)

        y_ddsp_logspec = torch.log(spectrogram_torch(
            y_ddsp.squeeze(1),
            hps.data.n_fft,
            hps.data.sample_rate,
            hps.data.hop_size,
            hps.data.win_size
        ) + 1e-7)

        y_mel = mel_spectrogram_torch(
            y.squeeze(1),
            hps.data.n_fft,
            hps.data.acoustic_dim,
            hps.data.sample_rate,
            hps.data.hop_size,
            hps.data.win_size,
            hps.data.fmin,
            hps.data.fmax
        )
        y_hat_mel = mel_spectrogram_torch(
            y_hat.squeeze(1),
            hps.data.n_fft,
            hps.data.acoustic_dim,
            hps.data.sample_rate,
            hps.data.hop_size,
            hps.data.win_size,
            hps.data.fmin,
            hps.data.fmax
        )

        y_d_hat_r, y_d_hat_g, _, _ = net_d(y, y_hat.detach())
        loss_disc, losses_disc_r, losses_disc_g = discriminator_loss(y_d_hat_r, y_d_hat_g)
        loss_disc_all = loss_disc

        optim_d.zero_grad()
        loss_disc_all.backward()
        grad_norm_d = commons.clip_grad_value_(net_d.parameters(), None)
        optim_d.step()

        # loss
        y_d_hat_r, y_d_hat_g, fmap_r, fmap_g = net_d(y, y_hat)

        loss_mel = F.l1_loss(y_mel, y_hat_mel) * 45
        loss_mel_dsp = F.l1_loss(y_mel, y_ddsp_mel) * 45
        loss_spec_dsp = F.l1_loss(y_logspec, y_ddsp_logspec) * 45

        loss_mel_am = F.mse_loss(mel * mask, predict_mel * mask)  # * 10

        loss_fm = feature_loss(fmap_r, fmap_g)
        loss_gen, losses_gen = generator_loss(y_d_hat_g)

        loss_fm = loss_fm / 2
        loss_gen = loss_gen / 2
        loss_gen_all = loss_gen + loss_fm + loss_mel + loss_mel_dsp + kl_div + loss_mel_am + loss_spec_dsp

        loss_gen_all = loss_gen_all / hps.train.accumulation_steps

        loss_gen_all.backward()
        if ((global_step + 1) % hps.train.accumulation_steps == 0):
            grad_norm_g = commons.clip_grad_value_(net_g.parameters(), None)
            optim_g.step()
            optim_g.zero_grad()

        if rank == 0:
            if (global_step + 1) % (hps.train.accumulation_steps * 10) == 0:
                logger.info(["step&time", global_step, time.asctime(time.localtime(time.time()))])
                logger.info(["mel&mel_dsp&spec_dsp: ", loss_mel, loss_mel_dsp, loss_spec_dsp])
                logger.info(["adv&fm: ", loss_gen, loss_fm])
                logger.info(["kl: ", kl_div])
                logger.info(["am&dur: ", loss_mel_am])

            if global_step % hps.train.log_interval == 0:
                lr = optim_g.param_groups[0]['lr']
                losses = [loss_gen_all, loss_mel]
                logger.info('Train Epoch: {} [{:.0f}%]'.format(
                    epoch,
                    100. * batch_idx / len(train_loader)))
                logger.info([x.item() for x in losses] + [global_step, lr])

                scalar_dict = {"loss/total": loss_gen_all,
                               "loss/mel": loss_mel,
                               "loss/adv": loss_gen,
                               "loss/fm": loss_fm,
                               "loss/mel_ddsp": loss_mel_dsp,
                               "loss/spec_ddsp": loss_spec_dsp,
                               "loss/mel_am": loss_mel_am,
                               "loss/kl_div": kl_div,
                               "learning_rate": lr}

                utils.summarize(
                    writer=writer,
                    global_step=global_step,
                    scalars=scalar_dict)

            if global_step % hps.train.eval_interval == 0:
                logger.info(['All training params(G): ', utils.count_parameters(net_g), ' M'])
                # print('Sub training params(G): ', \
                #       'text_encoder: ', utils.count_parameters(net_g.module.text_encoder), ' M, ', \
                #       'decoder: ', utils.count_parameters(net_g.module.decoder), ' M, ', \
                #       'mel_decoder: ', utils.count_parameters(net_g.module.mel_decoder), ' M, ', \
                #       'dec: ', utils.count_parameters(net_g.module.dec), ' M, ', \
                #       'dec_harm: ', utils.count_parameters(net_g.module.dec_harm), ' M, ', \
                #       'dec_noise: ', utils.count_parameters(net_g.module.dec_noise), ' M, ', \
                #       'posterior: ', utils.count_parameters(net_g.module.posterior_encoder), ' M, ', \
                #       )

                evaluate(hps, net_g, eval_loader, writer_eval)
                utils.save_checkpoint(net_g, optim_g, hps.train.learning_rate, epoch,
                                      os.path.join(hps.train.save_dir, "G_{}.pth".format(global_step)), hps.train.eval_interval)
                utils.save_checkpoint(net_d, optim_d, hps.train.learning_rate, epoch,
                                      os.path.join(hps.train.save_dir, "D_{}.pth".format(global_step)), hps.train.eval_interval)
                net_g.train()
        global_step += 1

    if rank == 0:
        logger.info('====> Epoch: {}'.format(epoch))


def evaluate(hps, generator, eval_loader, writer_eval):
    generator.eval()
    image_dict = {}
    audio_dict = {}
    with torch.no_grad():
        for batch_idx, data_dict in enumerate(eval_loader):
            if batch_idx == 4:
                break
            phone = data_dict["phone"]
            pitchid = data_dict["pitchid"]
            dur = data_dict["dur"]
            slur = data_dict["slur"]
            gtdur = data_dict["gtdur"]
            mel = data_dict["mel"]
            f0 = data_dict["f0"]
            wav = data_dict["wav"]
            spkid = data_dict["spkid"]

            phone_lengths = data_dict["phone_lengths"]
            mel_lengths = data_dict["mel_lengths"]
            wav_lengths = data_dict["wav_lengths"]
            f0_lengths = data_dict["f0_lengths"]

            # data
            if (use_cuda):
                phone, phone_lengths = phone.cuda(0), phone_lengths.cuda(0)
                pitchid = pitchid.cuda(0)
                dur = dur.cuda(0)
                slur = slur.cuda(0)
                wav = wav.cuda(0)
                mel = mel.cuda(0)
                f0 = f0.cuda(0)
                gtdur = gtdur.cuda(0)
                spkid = spkid.cuda(0)
            # remove else
            phone = phone[:1]
            phone_lengths = phone_lengths[:1]
            pitchid = pitchid[:1]
            dur = dur[:1]
            slur = slur[:1]
            wav = wav[:1]
            mel = mel[:1]
            f0 = f0[:1]
            gtdur = gtdur[:1]
            spkid = spkid[:1]

            y_hat, y_harm, y_noise = generator.module.infer(phone, phone_lengths, pitchid, dur, slur, gtdur=gtdur, F0=f0,
                                                            spk_id=spkid)
            spec = spectrogram_torch(
                wav.squeeze(1),
                hps.data.n_fft,
                hps.data.sample_rate,
                hps.data.hop_size,
                hps.data.win_size
            )

            y_mel = mel_spectrogram_torch(
                wav.squeeze(1),
                hps.data.n_fft,
                hps.data.acoustic_dim,
                hps.data.sample_rate,
                hps.data.hop_size,
                hps.data.win_size,
                hps.data.fmin,
                hps.data.fmax
            )
            y_hat_mel = mel_spectrogram_torch(
                y_hat.squeeze(1),
                hps.data.n_fft,
                hps.data.acoustic_dim,
                hps.data.sample_rate,
                hps.data.hop_size,
                hps.data.win_size,
                hps.data.fmin,
                hps.data.fmax
            )
            image_dict.update({
                f"gen/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(y_hat_mel[0].cpu().numpy()),
            })
            audio_dict.update({
                f"gen/audio_{batch_idx}": y_hat[0, :, :],
                f"gen/harm_{batch_idx}": y_harm[0, :, :],
                "gen/noise": y_noise[0, :, :]
            })
            # if global_step == 0:
            image_dict.update({f"gt/mel_{batch_idx}": utils.plot_spectrogram_to_numpy(mel[0].cpu().numpy())})
            audio_dict.update({f"gt/audio_{batch_idx}": wav[0, :, :wav_lengths[0]]})

    utils.summarize(
        writer=writer_eval,
        global_step=global_step,
        images=image_dict,
        audios=audio_dict,
        audio_sampling_rate=hps.data.sample_rate
    )
    generator.train()


if __name__ == "__main__":
    main()
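For reference, train_and_evaluate above scales the generator loss by 1 / hps.train.accumulation_steps and only steps optim_g every accumulation_steps batches. A minimal sketch of that gradient-accumulation pattern with a toy model (all names here are hypothetical, not from the repository):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
accumulation_steps = 4

for step in range(16):
    x = torch.randn(8, 4)
    loss = model(x).pow(2).mean()
    (loss / accumulation_steps).backward()   # scale so the accumulated gradient averages over the batches
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()                     # update once every accumulation_steps batches
        optimizer.zero_grad()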
infer/__init__.py
ADDED
@@ -0,0 +1,122 @@
import time

import librosa
import numpy as np
import torch
import tqdm
from text import npu


def resize2d_f0(x, target_len):
    source = np.array(x)
    source[source < 0.001] = np.nan
    target = np.interp(np.arange(0, len(source) * target_len, len(source)) / target_len, np.arange(0, len(source)),
                       source)
    res = np.nan_to_num(target)
    return res


def preprocess(ds):
    note_list = ds["note_seq"]
    midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
             for x in note_list.split(" ")]
    f0_seq = None
    if ds["f0_seq"] is not None:
        f0_seq = [float(i.strip()) for i in ds["f0_seq"].split(" ")]
        f0_seq = np.array(f0_seq)
    phseq = ds["ph_seq"].split(" ")
    newphseq = []
    for ph in phseq:
        newphseq.append(npu.ttsing_phone_to_int[ph])
    phseq = newphseq
    phseq = np.array(phseq)
    pitch = 440 * (2 ** ((np.array(midis) - 69) / 12))
    durations = [float(i) for i in ds["ph_dur"].split(" ")]
    accu_dur = 0
    accu_durs = []
    for dur in durations:
        accu_dur += dur
        accu_durs.append(accu_dur)
    accu_durs = np.array(accu_durs)
    accu_durs = (accu_durs * 44100 // 512).astype(int)
    sub_durs = np.zeros_like(accu_durs)
    sub_durs[1:accu_durs.shape[0]] = accu_durs[:accu_durs.shape[0] - 1]
    durations = accu_durs - sub_durs
    f0_seq = resize2d_f0(f0_seq, sum(durations))
    pos = 0
    for i, d in enumerate(durations):
        if phseq[i] == 0:
            f0_seq[pos:pos + d] = 0
        pos += d

    return f0_seq, pitch, phseq, durations
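# Illustrative usage of resize2d_f0 above (editorial sketch, not part of the original file):
# it stretches or squeezes an f0 contour to a new frame count by linear interpolation;
# zero-valued (unvoiced) frames are masked to NaN first and restored to zero afterwards, e.g.
#   resize2d_f0([0.0, 220.0, 440.0, 440.0, 0.0], 12)  ->  12-frame contour with zeros at the edges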
if __name__ == '__main__':
    inp = {
"text": "SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 啊 SP",
"ph_seq": "SP x ing z ou z ai w ei x ian b ian y van s i0 y i d e g uai d ao SP z i0 y ou d e t iao zh e zh ir j ian sh ang d e w u d ao SP q ing y ing d e x iang an y ing zh ong c ang f u d e b o s i0 m ao d eng d ai x ia y i g e m u u b iao SP",
"note_seq": "rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 A#4 A#4 A4 A4 G4 G4 D4 D4 G4 G4 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 C5 C5 C5 C5 G5 G5 C5 C5 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 C5 C5 B4 B4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 G4 G4 D4 D4 G4 G4 F4 F4 G4 G4 A#4 A#4 C5 C5 C#5 D5 D5 rest",
"note_dur_seq": "0.6 0.136 0.136 0.137 0.137 0.545 0.545 0.546 0.546 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2719998 0.2719998 0.546 0.546 0.5450001 0.5450001 0.2730002 0.2730002 0.4089999 0.4089999 0.1370001 0.1359997 0.1359997 0.1360002 0.1360002 0.546 0.546 0.5450001 0.5450001 0.2729998 0.2729998 0.2730002 0.2730002 0.2719998 0.2719998 0.546 0.546 0.2730002 0.2730002 0.5449996 0.5449996 0.6820002 0.6820002 0.1359997 0.1370001 0.1370001 0.1360006 0.1360006 0.5450001 0.5450001 0.5459995 0.5459995 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.3640003 0.3640003 0.1809998 0.1809998 0.3640003 0.3640003 0.1820002 0.1820002 0.3639994 0.3639994 0.1810007 0.1810007 0.3639994 0.3639994 0.1820002 0.1820002 0.4090004 0.4090004 0.4089994 0.4089994 0.2729998 0.2729998 0.2720003 0.2720003 0.5460005 0.8179989 0.8179989 0.5",
"is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0",
"ph_dur": "0.3875 0.2125 0.070091 0.065909 0.082455 0.054545 0.474545 0.070455 0.339182 0.206818 0.244727 0.027273 0.207091 0.065909 0.163909 0.109091 0.272 0 0.442591 0.103409 0.447273 0.097727 0.224137 0.048864 0.409 0.088136 0.048864 0.070091 0.065909 0.081455 0.054545 0.452818 0.093182 0.37 0.175 0.103682 0.169318 0.115046 0.157955 0.1845 0.0875 0.475545 0.070455 0.273 0 0.506363 0.038636 0.682 0.054182 0.081818 0.076773 0.060227 0.097364 0.038636 0.354091 0.190909 0.546 0.202545 0.070455 0.168591 0.103409 0.218454 0.054545 0.2765 0.0875 0.148045 0.032955 0.325364 0.038636 0.067227 0.114773 0.270818 0.093182 0.148046 0.032955 0.286727 0.077273 0.057 0.125 0.409 0 0.381727 0.027273 0.152545 0.120455 0.272 0.441653 0.104348 0.817999 0.5",
"f0_timestep": "0.005",
"f0_seq": "587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 587.0 586.9 586.7 586.1 585.4 584.8 584.1 583.4 582.9 582.5 582.3 582.5 582.9 583.4 584.1 584.9 585.5 586.1 586.7 587.0 587.3 587.6 587.9 588.0 588.1 588.4 588.7 588.7 588.7 588.0 586.4 584.1 580.8 575.8 568.7 560.8 552.0 540.9 531.0 522.2 513.8 506.6 501.7 497.9 495.0 493.8 493.0 492.6 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.6 493.2 494.1 495.6 498.7 502.5 507.6 515.5 523.9 532.9 543.2 553.7 562.4 570.3 577.2 581.7 584.6 586.9 588.2 588.7 588.7 588.6 588.3 588.1 588.0 587.8 587.5 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.2 586.9 586.7 587.0 587.0 587.0 587.0 587.5 588.7 590.8 594.1 599.0 607.7 617.7 630.6 647.9 667.1 686.3 706.4 727.1 743.0 755.2 765.1 773.3 778.6 781.6 783.4 784.4 784.4 784.4 784.4 784.7 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.4 784.4 784.4 784.4 783.8 782.3 779.9 775.1 768.7 759.5 747.9 731.5 712.9 694.2 674.0 652.5 636.1 622.4 610.1 601.9 596.0 591.8 589.1 587.8 587.0 587.0 587.0 587.0 586.8 586.8 587.1 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.6 587.9 588.0 588.1 588.5 589.1 589.4 589.4 589.1 588.4 586.8 584.5 581.2 575.9 570.6 564.1 556.0 548.8 542.3 536.2 531.1 527.3 524.8 522.6 521.9 521.5 521.4 521.6 521.9 522.4 522.6 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.6 523.9 524.1 524.4 524.8 525.4 525.8 526.0 526.2 525.7 524.9 523.3 521.1 518.6 515.3 511.3 507.6 504.0 499.9 497.3 495.0 493.1 492.0 491.4 491.1 491.4 491.6 492.1 492.6 492.9 493.2 493.4 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.1 494.3 494.5 494.8 495.1 495.6 496.1 496.4 496.6 496.5 495.8 494.7 493.2 491.0 487.9 484.7 481.2 477.3 473.8 470.9 468.4 466.2 464.8 464.1 463.6 463.7 463.9 464.2 464.7 465.1 465.4 465.6 465.8 466.1 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 
466.2 466.2 466.2 466.2 466.2 466.4 466.7 466.9 467.2 467.5 468.0 468.4 468.6 468.9 468.3 467.6 466.4 464.4 462.0 459.3 456.0 452.2 449.0 446.0 443.1 441.0 439.5 438.5 437.9 437.5 437.7 437.9 438.4 438.8 439.1 439.3 439.6 439.8 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.0 440.3 440.5 440.5 440.7 441.0 441.4 441.5 441.5 441.3 440.6 439.1 437.0 434.2 430.6 426.3 420.5 415.3 410.1 404.6 400.5 397.2 394.5 392.6 391.4 390.9 390.6 390.6 390.8 391.1 391.4 391.5 391.6 391.8 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.4 293.5 293.5 293.5 293.5 293.7 294.3 295.4 297.0 299.5 303.8 308.9 315.3 323.9 333.6 343.2 353.2 363.5 371.5 377.6 382.5 386.6 389.3 390.8 391.7 392.2 392.2 392.2 392.2 392.4 392.3 392.1 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.8 391.8 391.6 391.3 390.9 390.5 390.1 389.5 389.1 388.9 389.1 389.6 390.3 391.2 392.8 394.5 397.0 400.6 405.3 411.1 419.5 431.0 443.7 458.9 479.8 497.9 515.2 532.6 546.7 557.1 565.4 571.7 575.6 577.8 579.1 580.0 580.4 580.8 581.5 582.7 582.9 583.5 584.4 585.1 585.6 586.2 586.8 587.0 587.3 587.7 588.0 588.0 588.2 588.5 588.7 588.7 588.5 587.7 586.3 583.3 579.0 573.7 567.1 558.7 548.3 538.6 529.1 519.2 511.5 505.6 500.7 496.9 494.8 493.6 492.7 492.5 492.6 492.7 492.7 492.7 492.7 492.7 492.5 492.7 493.3 494.5 496.5 499.4 503.7 510.1 517.2 525.5 536.3 546.3 555.5 564.6 572.6 578.1 582.6 585.6 587.3 588.3 588.7 588.7 588.6 588.3 588.0 588.0 587.7 587.4 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 586.8 586.8 587.0 587.0 587.0 587.0 587.8 589.1 591.4 595.5 601.9 609.7 619.7 636.1 652.5 670.9 692.6 712.9 730.2 745.9 759.5 768.7 775.1 779.9 782.3 783.8 784.4 784.4 784.4 784.4 784.8 784.5 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 
784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.2 784.6 784.7 784.4 784.4 784.4 784.4 783.5 781.7 778.6 773.8 766.5 755.2 743.0 727.1 706.4 686.3 667.1 649.2 632.8 617.7 607.7 600.4 594.3 590.8 588.9 587.6 587.0 587.0 587.0 587.0 586.7 586.9 587.2 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.7 588.0 588.0 588.2 588.7 589.1 589.4 589.3 589.0 588.2 586.1 583.4 579.6 574.8 569.0 561.3 554.4 547.5 540.1 534.7 530.2 526.6 524.1 522.5 521.7 521.4 521.4 521.6 522.1 522.5 522.6 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.4 523.7 523.9 524.2 524.5 525.0 525.6 525.9 526.1 526.1 525.5 524.5 522.9 520.7 517.6 514.2 510.6 506.6 502.6 499.4 496.7 494.2 492.7 491.9 491.3 491.2 491.4 491.7 492.3 492.7 493.0 493.2 493.5 493.7 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.6 493.4 493.1 492.9 492.6 492.1 491.6 491.3 491.1 491.5 492.2 493.3 495.2 497.8 500.6 504.0 508.4 512.0 515.6 518.9 521.6 523.6 524.9 525.8 526.3 526.0 525.8 525.3 524.8 524.4 524.1 523.8 523.6 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.8 524.0 524.4 525.0 525.5 526.0 526.9 527.4 527.7 527.8 527.5 527.0 526.4 525.5 524.5 523.5 522.4 521.3 520.4 519.7 519.2 518.7 518.7 519.0 519.5 520.2 520.8 521.4 521.9 522.4 522.6 522.9 523.2 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.6 523.3 523.3 523.5 523.9 524.8 526.3 529.0 533.6 539.5 548.4 560.5 577.8 598.5 620.8 646.5 675.9 700.1 720.9 741.4 755.4 765.4 773.0 778.1 781.1 782.6 783.5 783.9 784.0 784.5 784.7 784.3 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.1 784.5 784.9 784.1 784.0 783.7 783.1 782.0 780.0 775.7 770.5 762.0 748.5 731.9 712.5 688.4 660.8 635.5 611.2 586.0 569.0 555.3 543.8 535.9 531.0 527.6 525.2 524.2 523.7 523.3 523.3 522.9 522.7 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 
523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.9 522.7 522.2 521.7 521.1 520.5 519.8 519.3 519.0 519.0 519.2 519.7 520.5 521.5 522.7 524.5 526.0 528.0 530.9 534.3 538.4 543.6 549.7 555.5 561.5 568.0 572.4 575.9 578.8 580.8 581.9 582.6 582.6 582.6 582.3 582.0 581.9 582.3 582.7 583.1 583.8 584.6 585.2 585.8 586.4 586.8 587.1 587.4 587.7 588.0 588.0 588.3 588.6 588.7 588.7 588.3 587.3 585.1 582.6 578.1 572.6 564.6 555.5 546.3 536.3 525.5 517.2 510.1 503.7 499.4 496.5 494.5 493.3 492.7 492.5 492.7 492.7 492.7 492.7 492.7 492.6 492.5 492.9 493.6 494.8 497.3 501.0 505.6 511.5 519.2 529.1 538.6 548.3 558.7 567.1 573.7 579.4 583.6 586.0 587.7 588.7 588.7 588.7 588.5 588.2 588.0 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.0 586.7 586.9 587.0 587.0 587.0 587.2 588.0 589.4 592.4 597.0 603.3 612.5 625.1 639.3 655.8 678.6 698.1 716.6 735.3 750.3 761.4 770.3 777.1 780.8 782.7 784.0 784.4 784.4 784.4 784.5 784.8 784.4 784.1 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.0 784.4 784.7 784.7 784.4 784.4 784.4 784.4 783.1 781.2 777.8 771.8 763.2 752.8 739.1 720.2 702.1 682.4 663.3 643.9 627.9 615.7 605.9 598.0 593.1 590.3 588.3 587.3 587.0 587.0 587.0 586.9 586.7 587.0 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.4 587.7 588.0 588.0 588.3 588.8 589.2 589.4 589.3 588.8 587.6 585.6 582.8 578.7 573.1 566.7 559.9 552.7 544.8 538.6 533.7 528.8 525.8 523.7 522.3 521.6 521.4 521.4 521.7 522.2 522.5 522.6 522.8 523.0 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.5 523.7 524.0 524.3 524.6 525.2 525.7 525.9 526.2 525.9 525.2 524.2 522.4 519.7 516.9 513.5 509.2 505.4 501.9 498.6 495.9 493.9 492.5 491.6 491.1 491.2 491.5 491.9 492.4 492.8 493.1 493.3 493.6 493.8 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 493.9 494.2 494.5 494.7 494.9 495.4 495.9 496.3 496.5 496.6 496.2 495.5 493.9 491.9 489.5 486.4 482.6 479.1 475.7 471.9 469.4 467.2 465.5 464.4 463.8 463.5 463.8 464.0 464.5 465.0 465.3 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.8 467.1 467.5 468.0 468.5 469.1 
469.7 470.0 470.2 470.1 469.7 469.2 468.5 467.6 466.7 465.7 464.7 463.9 463.2 462.7 462.2 462.1 462.3 462.7 463.2 463.9 464.4 464.8 465.3 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.3 468.9 469.5 469.9 470.2 470.2 469.9 469.4 468.7 468.0 467.0 466.0 465.1 464.2 463.4 462.9 462.4 462.1 462.2 462.5 462.9 463.6 464.2 464.6 465.1 465.5 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 467.0 467.4 467.9 468.4 469.0 469.6 470.0 470.2 470.2 469.7 469.2 468.6 467.7 466.7 465.8 464.9 463.9 463.2 462.8 462.3 462.1 462.3 462.6 463.1 463.8 464.3 464.7 465.2 465.5 465.8 466.0 466.2 466.2 466.2 466.4 466.7 466.9 467.3 467.8 468.2 468.8 469.5 469.9 470.2 470.2 469.9 469.4 468.9 468.1 467.1 466.2 465.3 464.2 463.5 462.9 462.5 462.1 462.2 462.4 462.9 463.6 464.1 464.6 465.1 465.4 465.7 465.9 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.3 466.5 466.7 466.7 466.9 467.2 467.2 467.2 467.0 466.1 464.4 462.4 458.9 454.5 448.1 440.9 433.6 425.7 417.1 410.5 404.8 399.8 396.4 394.0 392.4 391.5 391.1 391.1 391.1 391.3 391.5 391.5 391.7 391.8 392.0 392.0 392.0 392.2 392.4 392.3 392.2 392.2 392.2 392.1 391.5 390.6 388.6 385.6 381.6 375.9 368.3 360.1 351.0 339.3 329.8 321.3 313.1 306.8 302.4 298.9 296.3 294.9 294.1 293.7 293.5 293.5 293.5 293.5 293.4 293.5 293.6 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.6 293.5 293.3 293.5 293.5 293.5 293.5 293.7 294.2 295.1 296.6 299.0 303.0 307.8 313.9 322.0 331.7 341.2 351.0 362.0 369.9 376.4 382.2 386.3 388.9 390.6 391.7 392.2 392.2 392.2 392.2 392.3 392.4 392.2 392.2 392.0 392.2 392.2 392.4 392.4 392.5 392.8 393.2 393.4 393.4 393.2 392.7 391.5 389.7 387.4 384.2 380.0 375.6 370.9 366.3 361.5 357.5 354.4 351.9 350.0 348.8 348.3 348.1 348.0 348.1 348.4 348.7 348.8 348.8 349.0 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.2 349.0 348.9 348.8 348.7 348.4 348.2 348.0 348.1 348.2 348.7 350.0 351.6 353.9 356.8 360.5 365.4 370.0 374.7 379.8 383.6 386.8 389.5 391.4 392.6 393.1 393.4 393.4 393.2 392.9 392.6 392.4 392.4 392.2 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 392.0 391.7 391.5 391.5 391.4 391.2 391.1 391.1 391.2 392.0 393.1 394.8 398.1 402.1 407.1 413.7 421.4 429.0 436.7 445.1 451.4 456.3 460.7 463.6 465.4 466.6 467.2 467.2 467.2 467.0 466.8 466.7 466.6 466.4 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.2 466.0 465.8 465.6 465.6 465.3 464.8 464.6 464.6 464.7 465.2 466.4 468.4 470.9 474.6 479.8 485.4 491.4 498.4 
504.8 510.2 514.9 519.2 521.7 523.5 524.6 525.0 525.1 525.0 524.6 524.1 523.9 523.9 523.6 523.4 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.3 523.2 522.9 522.7 522.4 522.1 521.7 521.2 520.7 520.5 520.2 520.8 521.6 522.9 525.2 528.0 531.1 534.9 539.4 543.3 546.9 550.5 553.1 555.0 556.3 557.1 557.5 557.3 557.0 556.4 555.9 555.5 555.2 554.9 554.6 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.4 554.0 553.8 553.5 553.2 552.8 552.2 551.7 551.4 551.2 551.7 552.6 554.0 556.1 558.8 562.0 566.5 570.6 574.7 579.3 582.7 585.5 587.8 589.4 590.2 590.7 590.4 590.1 589.6 589.0 588.5 588.2 587.9 587.6 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.3 587.1 587.0 586.9 586.4 585.9 585.3 584.6 583.8 583.2 582.7 582.4 582.4 582.7 583.0 583.6 584.5 585.1 585.7 586.3 586.8 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0 587.0",
    "input_type": "phoneme",
    "offset": 72.491
}
res = preprocess(inp)
print(res)
print([float(i) for i in res[0]])


def cross_fade(a: np.ndarray, b: np.ndarray, idx: int):
    # Overlap-add: keep a[:idx], blend the tail of a with the head of b, then append the rest of b.
    result = np.zeros(idx + b.shape[0])
    fade_len = a.shape[0] - idx
    np.copyto(dst=result[:idx], src=a[:idx])
    k = np.linspace(0, 1.0, num=fade_len, endpoint=True)
    result[idx: a.shape[0]] = (1 - k) * a[idx:] + k * b[: fade_len]
    np.copyto(dst=result[a.shape[0]:], src=b[fade_len:])
    return result


def infer_ds(model, hps, ds, speaker, trans):

    sample_rate = 44100

    result = np.zeros(0)
    current_length = 0
    for inp in tqdm.tqdm(ds):
        spkid = hps.data.spk2id[speaker]
        f0_seq, pitch, phseq, durations = preprocess(inp)

        f0 = torch.FloatTensor(f0_seq).unsqueeze(0)

        text_norm = torch.LongTensor(phseq)
        x_tst = text_norm.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([text_norm.size(0)])
        spk = torch.LongTensor([spkid])
        manual_f0 = torch.FloatTensor(f0).unsqueeze(0)
        manual_dur = torch.LongTensor(durations).unsqueeze(0)
        t1 = time.time()
        # Synthesize one segment with the frozen phoneme durations and f0, transposed by `trans` semitones.
        infer_res = model.infer(x_tst, x_tst_lengths, None, None,
                                None, gtdur=manual_dur, spk_id=spk,
                                F0=manual_f0 * 2 ** (trans / 12))
        seg_audio = infer_res[0][0, 0].data.float().numpy()
        try:
            offset_ = inp['offset']
        except:
            offset_ = 0
        # Place the segment at its offset (seconds * sample_rate): pad with silence,
        # or cross-fade when it overlaps audio that has already been rendered.
        silent_length = round(offset_ * sample_rate) - current_length
        if silent_length >= 0:
            result = np.append(result, np.zeros(silent_length))
            result = np.append(result, seg_audio)
        else:
            result = cross_fade(result, seg_audio, current_length + silent_length)
        current_length = current_length + silent_length + seg_audio.shape[0]
        print("infer time:", time.time() - t1)
    return result


#
# midis = [librosa.note_to_midi(x.split("/")[0]) if x != 'rest' else 0
#          for x in note_lst]
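As a reading aid (not part of the committed file), here is a minimal sketch of the overlap case that infer_ds hits when silent_length is negative, i.e. when the next segment's offset lands before the end of the audio already generated; the toy arrays a, b and index idx below are made up for illustration:

import numpy as np

# a: audio rendered so far; b: new segment that starts idx samples into a,
# so the last len(a) - idx samples overlap and are linearly cross-faded.
a = np.ones(8)
b = np.full(6, -1.0)
idx = 5

fade_len = a.shape[0] - idx                  # 3 overlapping samples
k = np.linspace(0, 1.0, num=fade_len)        # ramp from a (k=0) to b (k=1)
mixed = (1 - k) * a[idx:] + k * b[:fade_len]
out = np.concatenate([a[:idx], mixed, b[fade_len:]])
assert out.shape[0] == idx + b.shape[0]      # same length cross_fade returns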
infer/share.ds
ADDED
@@ -0,0 +1,62 @@
[
{
"text": "SP 清 晨 SP",
"ph_seq": "SP q ing ch en SP",
"note_seq": "rest D4 D4 G4 G4 rest",
"note_dur_seq": "0.6 0.273 0.273 0.4089999 0.4089999 0.4",
"is_slur_seq": "0 0 0 0 0 0",
"ph_dur": "0.469318 0.130682 0.120727 0.152273 0.409 0.4",
"f0_timestep": "0.005",
"f0_seq": "301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 301.9 302.0 302.4 301.9 301.4 300.5 299.4 299.0 298.3 297.9 297.6 297.2 297.2 297.0 296.8 296.9 296.7 296.6 296.8 296.9 296.9 297.4 297.6 297.7 298.2 298.5 298.3 298.6 298.7 298.5 298.6 298.3 297.8 296.4 293.9 291.5 286.7 283.2 279.6 278.5 283.4 288.4 293.5 298.6 303.9 309.3 314.7 320.3 325.9 331.7 337.5 343.5 349.5 355.7 362.0 368.3 374.8 381.5 387.1 388.7 391.3 393.6 396.1 397.7 398.7 399.3 399.6 399.8 399.4 399.0 398.6 397.9 397.7 397.1 396.7 396.1 396.0 395.4 395.6 395.7 395.9 395.9 396.1 396.4 396.8 397.0 397.3 397.5 397.5 397.5 397.7 397.7 397.7 397.7 397.9 397.7 397.7 397.7 397.7 397.7 397.7 397.5 397.5 397.2 397.0 397.0 396.7 396.6 396.6 396.5 396.3 396.3 396.1 396.1 396.3 396.3 396.1 396.3 396.3 396.4 396.6 396.7 396.6 396.9 397.2 396.8 397.4 397.9 398.0 398.5 399.1 399.1 399.1 399.0 398.7 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2 398.2",
"input_type": "phoneme",
"offset": 16.582
},
{
"text": "SP 阳 光 攀 过 窗 沿 隙 缝 SP 温 暖 了 我 的 被 窝 搅 醒 没 做 完 的 好 梦 SP 起 身 SP 手 忙 脚 乱 匆 匆 出 门 SP 怕 赶 不 上 整 点 开 走 的 巴 士 一 路 狂 奔 SP 铃 声 SP 唤 不 动 我 疲 惫 眼 神 SP 怪 自 己 昨 夜 睡 的 太 沉 闹 钟 不 够 大 声 SP",
"ph_seq": "SP y ang g uang p an g uo ch uang y En x i f eng SP w en n uan l e w o d e b ei w o j iao x ing m ei z uo w an d e h ao m eng SP q i sh en SP sh ou m ang j iao l uan c ong c ong ch u m en SP p a g an b u sh ang zh eng d ian k ai z ou d e b a sh ir y i l u k uang b en SP l ing sh eng SP h uan b u d ong w o p i b ei y En sh en SP g uai z i0 j i z uo y E sh ui d e t ai ch en n ao zh ong b u g ou d a sh eng SP",
"note_seq": "rest G4 G4 G4 G4 G4 G4 G4 G4 F#4 F#4 G4 G4 A4 A4 G4 G4 rest F#4 F#4 F#4 F#4 F#4 F#4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 D4 D4 E4 E4 G4 G4 D4 D4 D4 D4 D4 D4 rest D4 D4 G4 G4 rest G4 G4 G4 G4 G4 G4 G4 G4 F#4 F#4 G4 G4 A4 A4 G4 G4 rest F#4 F#4 F#4 F#4 F#4 F#4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 D4 D4 E4 E4 G4 G4 A4 A4 G4 G4 F#4 F#4 rest D4 D4 G4 G4 rest G4 G4 G4 G4 G4 G4 G4 G4 F#4 F#4 G4 G4 A4 A4 G4 G4 rest F#4 F#4 F#4 F#4 F#4 F#4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 E4 D4 D4 E4 E4 G4 G4 D4 D4 D4 D4 D4 D4 rest",
"note_dur_seq": "0.327 0.545 0.545 0.273 0.273 0.273 0.273 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.273 0.273 0.4089999 0.4089999 0.1359999 0.273 0.273 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2730002 0.2730002 0.2719998 0.2719998 0.2729998 0.2729998 0.2730002 0.2730002 0.2729998 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.4089999 0.4089999 0.4090004 0.4090004 0.2729998 0.2729998 0.5450001 0.5450001 0.2729998 0.2730002 0.2730002 0.5450001 0.5450001 0.2729998 0.5450001 0.5450001 0.2729998 0.2729998 0.2730007 0.2730007 0.2729998 0.2729998 0.2719994 0.2719994 0.2730007 0.2730007 0.2729998 0.2729998 0.4090004 0.4090004 0.1359997 0.2729998 0.2729998 0.2729998 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.2730007 0.2730007 0.2729998 0.2729998 0.2719994 0.2719994 0.2730007 0.2730007 0.2729998 0.2729998 0.2729998 0.2729998 0.2720003 0.2720003 0.4089994 0.4089994 0.4090004 0.4090004 0.2729998 0.2729998 0.5459995 0.5459995 0.2720013 0.2729988 0.2729988 0.5460014 0.5460014 0.2719994 0.5459995 0.5459995 0.2720013 0.2720013 0.2729988 0.2729988 0.2730007 0.2730007 0.2729988 0.2729988 0.2720013 0.2720013 0.2729988 0.2729988 0.2730007 0.2730007 0.2730007 0.2719994 0.2719994 0.2730007 0.2730007 0.2729988 0.2729988 0.2720013 0.2720013 0.2729988 0.2729988 0.2730007 0.2730007 0.2729988 0.2729988 0.2720013 0.2720013 0.2729988 0.2729988 0.2730007 0.2730007 0.2730007 0.2730007 0.4089985 0.4089985 0.4090004 0.4090004 0.2720013 0.2720013 0.4099998 0.4099998 0.081",
"is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
"ph_dur": "0.245182 0.081818 0.490454 0.054546 0.202546 0.070454 0.218454 0.054546 0.151545 0.120455 0.202546 0.070454 0.136636 0.136364 0.103683 0.169317 0.409 0.087136 0.048864 0.1855 0.0875 0.178818 0.093182 0.1855 0.0875 0.234363 0.038637 0.247999 0.025002 0.228817 0.043182 0.148 0.125 0.082092 0.190908 0.218453 0.054546 0.151546 0.120455 0.229817 0.043182 0.348773 0.060226 0.245363 0.163637 0.202546 0.070454 0.545 0.175273 0.097727 0.070728 0.202272 0.545 0.109363 0.163637 0.496136 0.048864 0.1855 0.0875 0.207092 0.065908 0.169591 0.103409 0.146999 0.125 0.136637 0.136364 0.115045 0.157955 0.409 0.087135 0.048864 0.245728 0.027271 0.234365 0.038635 0.119729 0.152271 0.196862 0.076138 0.247999 0.025002 0.185498 0.087502 0.195862 0.076138 0.234365 0.038635 0.234365 0.038635 0.115045 0.157955 0.168591 0.103409 0.343089 0.06591 0.327183 0.081818 0.218453 0.054546 0.546 0.206091 0.06591 0.10368 0.169319 0.546001 0.190182 0.081818 0.497137 0.048862 0.233366 0.038635 0.202545 0.070454 0.224138 0.048862 0.218452 0.054546 0.217455 0.054546 0.120728 0.152271 0.273001 0.240046 0.032955 0.217453 0.054546 0.202547 0.070454 0.179817 0.093182 0.24473 0.027271 0.130955 0.142044 0.234365 0.038635 0.147999 0.125 0.114046 0.157955 0.196861 0.076138 0.218454 0.054546 0.202547 0.070454 0.278318 0.13068 0.376045 0.032955 0.086775 0.185226 0.41 0.081",
"f0_timestep": "0.005",
"f0_seq": "371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.9 371.5 372.2 373.2 374.9 376.0 378.0 378.4 379.3 378.6 378.6 378.0 377.3 374.9 374.5 373.1 373.2 373.6 374.3 373.6 374.5 374.3 375.2 376.7 377.6 378.1 380.4 380.8 382.8 385.5 387.2 389.1 391.3 392.7 394.3 395.4 396.8 397.2 397.9 398.4 398.6 398.6 399.2 399.3 399.3 399.5 399.3 399.1 399.3 399.3 398.9 399.1 399.1 398.8 399.1 399.1 398.6 398.8 399.1 398.6 398.6 398.6 398.2 397.9 397.7 397.5 397.5 397.7 397.2 397.2 397.0 397.0 397.0 397.2 397.2 397.2 397.2 397.2 397.0 397.2 397.2 396.8 397.0 397.2 396.6 396.7 397.0 396.2 395.6 395.0 393.6 390.4 388.0 383.9 380.2 377.8 378.3 379.5 380.7 381.9 383.1 384.3 385.5 386.7 388.0 389.2 390.4 391.7 392.9 394.2 395.4 396.6 397.9 398.2 397.9 398.3 398.2 398.0 397.9 397.7 397.7 397.2 397.4 397.5 396.8 397.0 397.0 396.8 397.0 397.2 397.2 397.5 397.7 397.7 398.2 398.2 398.6 398.6 398.6 398.2 398.0 396.8 395.6 395.0 393.4 392.0 390.4 388.7 385.7 383.0 380.8 377.6 375.6 375.3 375.6 375.8 376.0 376.3 376.5 376.7 376.9 377.2 377.4 377.6 377.9 378.1 378.3 378.6 378.8 379.0 379.3 381.5 384.3 387.3 390.9 393.3 395.2 397.3 398.6 398.6 399.0 399.1 398.5 398.2 398.2 397.8 397.5 397.3 397.2 397.0 397.0 396.6 396.4 395.9 395.4 395.0 395.0 395.0 395.2 395.9 396.2 396.6 397.3 397.7 397.2 397.2 396.6 395.2 392.7 390.0 386.8 382.2 379.2 376.0 374.3 376.8 379.3 381.8 384.3 386.9 389.4 392.0 394.6 397.2 398.2 398.0 398.4 398.4 398.2 398.4 398.6 398.4 398.4 398.6 398.4 398.4 398.6 398.4 398.4 398.4 398.4 398.2 398.2 398.2 397.9 397.9 398.2 398.0 397.7 397.5 397.3 395.2 393.7 389.3 383.9 378.4 371.9 367.7 363.4 362.4 362.7 362.9 363.2 363.5 363.8 364.0 364.3 364.6 364.9 365.1 365.4 365.7 366.0 366.3 366.5 366.8 367.1 367.4 367.7 368.1 370.7 372.6 374.4 375.4 376.0 376.2 376.0 376.2 376.2 375.6 375.8 375.8 375.6 375.8 375.8 375.6 375.8 375.8 375.6 376.0 376.2 376.2 376.4 376.7 376.7 376.9 376.5 376.6 376.7 376.2 376.4 376.2 375.7 375.8 375.6 374.9 374.5 375.4 375.6 375.8 376.4 376.5 376.1 376.7 377.1 377.1 378.4 379.5 381.5 383.9 386.5 390.0 392.2 394.3 395.4 396.2 397.2 397.6 397.9 397.9 398.3 398.4 398.2 398.2 398.2 398.0 398.2 398.0 397.9 398.2 398.0 397.7 397.9 397.9 397.5 397.5 397.5 397.1 397.0 396.6 396.0 394.3 391.8 387.9 384.2 382.1 379.5 382.2 384.8 387.5 390.3 393.0 395.7 398.5 401.3 404.1 406.9 409.8 412.7 415.5 418.4 421.4 424.3 427.3 430.3 433.3 436.3 439.4 441.0 441.0 441.7 442.8 443.2 443.3 443.6 444.0 444.1 443.9 444.1 444.1 444.1 444.3 444.3 444.6 444.6 444.6 444.9 444.9 444.6 445.1 444.9 444.9 445.3 444.6 444.3 443.5 441.0 439.7 435.7 431.4 427.7 422.3 418.7 415.3 414.6 413.8 413.1 412.3 411.6 410.9 410.1 409.4 408.7 407.9 407.2 406.5 405.7 405.0 404.3 403.6 402.9 402.1 401.4 400.7 400.2 399.7 398.6 398.2 398.2 397.9 397.9 397.7 397.9 397.9 397.5 397.7 397.5 397.3 397.5 397.5 397.1 397.2 397.2 397.0 397.2 397.2 397.0 397.0 397.2 397.0 397.4 397.7 397.5 397.7 397.9 397.7 397.9 398.1 397.9 397.9 397.9 397.9 397.9 397.9 397.9 397.7 397.7 397.9 397.9 397.9 398.1 398.4 398.4 398.8 399.5 399.5 400.5 401.2 401.4 402.3 402.9 403.0 402.8 402.8 402.1 401.2 400.3 399.4 398.5 397.6 396.7 395.8 394.9 394.1 393.2 392.3 391.4 390.5 389.7 388.8 387.9 387.1 386.2 385.3 384.5 383.6 382.7 381.9 381.0 380.2 
379.3 378.5 377.6 376.8 375.9 375.1 374.3 373.4 372.6 371.7 370.9 370.1 369.3 368.4 367.6 366.8 366.0 365.8 366.6 367.4 368.5 369.1 370.2 371.1 371.7 372.2 372.8 373.3 373.4 373.4 373.6 373.2 373.4 373.4 373.2 373.0 373.2 372.9 373.0 373.2 373.2 373.0 373.2 373.0 372.8 372.8 372.1 372.0 372.1 371.3 371.3 371.9 371.6 371.7 372.1 371.6 371.7 372.1 371.3 371.3 371.5 370.6 370.5 370.9 370.2 370.0 370.6 370.3 370.0 370.9 370.9 370.9 371.4 372.1 373.0 373.7 374.9 375.1 376.0 376.0 376.0 376.2 376.4 376.0 376.0 375.8 375.6 375.4 375.2 374.9 374.9 374.9 374.9 374.9 374.9 374.9 374.9 374.8 374.7 374.5 374.5 374.3 374.1 374.1 373.9 374.0 374.3 374.1 374.5 374.5 374.3 374.5 374.5 374.3 374.5 374.3 373.6 373.6 373.2 372.6 372.7 372.6 372.4 372.8 373.0 373.2 373.9 374.6 374.9 375.4 367.2 368.1 369.3 370.9 371.5 371.5 371.7 371.9 371.7 371.9 371.9 371.9 372.3 372.4 372.1 372.5 372.6 372.6 372.8 373.0 373.0 373.0 373.2 373.2 373.2 373.2 373.4 373.4 373.2 372.6 372.0 371.5 370.2 368.1 366.6 364.5 360.5 358.4 354.7 351.3 349.1 346.2 342.2 337.3 334.0 332.4 330.8 330.3 329.8 329.8 329.3 329.8 329.7 330.0 329.6 331.5 333.5 334.3 335.0 335.2 335.4 335.2 335.4 335.4 335.2 335.0 334.8 334.6 334.2 334.2 334.1 333.8 333.8 334.0 333.8 334.0 334.2 334.2 334.4 334.4 334.6 334.6 334.6 334.6 334.6 334.5 334.4 333.9 333.3 332.7 329.8 326.2 321.4 314.7 308.1 304.4 300.0 298.1 300.0 302.8 305.6 308.5 311.3 314.3 317.2 320.1 323.1 326.2 329.2 332.3 335.4 338.5 338.7 338.1 337.1 336.8 336.0 335.6 335.2 334.8 334.5 334.2 334.1 333.8 333.8 333.8 334.0 334.0 334.0 334.2 334.2 334.2 334.2 334.2 334.0 334.0 334.0 333.9 333.3 333.3 332.9 330.4 329.0 324.9 320.2 315.8 310.9 308.7 306.0 307.9 309.8 311.8 313.7 315.7 317.7 319.7 321.7 323.8 325.8 327.8 329.9 332.0 334.1 336.2 338.3 337.8 337.5 336.9 335.8 335.6 335.3 334.6 334.6 334.6 334.0 334.2 334.0 333.7 333.7 333.7 333.2 333.1 333.1 332.7 332.7 332.8 332.7 332.8 333.1 333.1 333.1 333.7 333.5 333.5 333.7 333.3 332.9 332.9 332.7 332.3 332.5 332.3 332.0 331.9 331.9 331.6 331.2 331.0 330.6 330.2 330.0 329.8 329.7 329.8 330.0 330.3 331.0 331.6 332.9 333.8 334.6 335.4 335.7 336.0 336.0 336.0 336.2 336.0 335.6 335.8 335.5 334.8 335.0 335.0 334.6 334.8 334.8 334.6 334.6 334.6 334.3 334.0 334.2 333.7 333.5 333.3 331.0 329.7 325.1 319.7 314.2 307.4 303.6 299.1 297.9 299.9 301.9 304.0 306.0 308.1 310.1 312.2 314.3 316.4 318.5 320.7 322.8 325.0 327.2 329.4 331.6 333.8 336.1 338.3 339.1 338.8 338.7 338.7 338.3 338.1 337.8 336.9 336.6 336.0 336.0 335.5 335.4 335.4 335.2 335.2 335.4 335.0 335.2 335.4 335.4 335.4 335.6 335.6 335.8 335.8 335.0 334.0 332.9 329.8 327.2 323.7 320.1 317.8 318.1 319.0 319.9 320.9 321.9 322.8 323.8 324.7 325.7 326.7 327.6 328.6 329.6 330.6 331.6 332.5 333.5 334.5 335.5 336.5 337.5 337.1 337.1 336.7 336.4 336.9 336.4 335.9 335.6 335.1 334.4 334.0 333.7 333.5 333.3 332.9 332.7 332.4 331.9 331.6 331.5 331.7 331.7 332.1 332.3 332.9 332.9 333.0 333.3 333.1 332.9 332.7 332.5 332.3 332.0 331.7 331.5 331.1 330.6 330.4 329.6 326.6 324.6 321.2 318.7 315.7 312.6 310.6 307.9 305.7 303.8 301.9 300.3 299.5 298.8 298.5 297.8 299.1 299.7 299.8 299.5 299.3 299.2 299.3 299.3 299.1 299.0 299.1 298.8 298.7 298.8 298.6 298.5 298.5 298.5 298.1 298.3 298.1 297.9 298.1 298.1 297.9 297.6 295.9 294.3 290.1 285.6 281.5 276.9 273.9 271.6 270.9 274.0 277.2 280.5 283.7 287.0 290.4 293.8 297.2 300.7 304.2 307.8 311.4 315.0 318.7 322.4 326.2 330.0 333.8 337.7 338.9 338.4 338.3 338.0 337.3 337.3 337.0 336.4 336.4 336.0 335.6 335.6 335.0 334.8 335.0 334.8 334.5 334.6 334.3 334.0 334.2 334.1 333.8 
333.8 333.7 333.7 333.7 333.7 333.5 333.5 333.3 333.3 333.1 333.1 332.7 332.1 332.0 331.0 329.7 328.7 326.8 328.2 330.8 332.4 333.7 336.0 337.9 340.1 342.4 345.8 348.8 352.0 357.8 362.0 363.2 364.5 365.4 365.5 366.4 367.2 367.0 366.8 367.9 367.5 368.5 369.8 370.1 370.6 372.4 373.0 374.9 376.9 378.9 379.6 382.8 383.5 385.1 388.2 390.0 391.8 394.0 394.8 395.9 396.4 397.0 397.2 397.2 396.6 396.4 395.9 395.2 394.4 394.3 393.3 392.4 392.0 391.1 389.5 388.8 387.3 385.3 383.8 381.9 379.9 376.7 374.3 371.8 367.2 365.1 362.5 359.9 357.2 354.6 351.9 349.3 346.8 344.2 341.7 339.1 336.6 334.2 331.7 329.2 326.8 324.4 322.0 319.6 317.3 314.9 312.6 310.3 308.0 305.7 303.5 301.2 302.1 301.9 301.2 300.5 299.7 299.1 298.7 298.3 298.0 297.8 297.6 297.6 297.4 297.6 297.4 297.6 297.6 297.6 297.6 297.6 297.6 297.6 297.6 297.6 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.6 297.8 297.8 297.8 297.8 297.9 298.1 298.1 298.5 298.6 298.6 299.0 299.0 298.3 298.4 297.4 295.5 294.1 291.6 289.4 287.0 285.6 286.0 286.4 286.8 287.2 287.6 288.0 288.4 288.8 289.2 289.6 290.0 290.4 290.8 291.2 291.6 292.0 292.4 292.9 293.3 293.7 294.1 294.5 294.9 295.3 295.7 296.6 297.9 298.4 299.1 299.7 300.0 300.0 300.0 300.0 299.5 299.1 298.8 298.4 298.1 298.1 297.8 297.4 297.3 297.1 297.1 297.1 296.9 297.0 296.9 297.1 297.2 297.6 297.4 297.4 297.4 297.3 297.4 297.2 297.1 297.2 296.8 296.4 296.0 295.2 294.3 293.8 292.8 292.4 292.1 291.3 291.2 291.0 290.7 290.3 290.6 290.8 291.0 291.0 291.8 292.3 293.4 295.5 296.9 298.1 299.1 299.1 298.8 298.5 298.5 298.3 298.1 298.1 298.0 297.8 297.8 297.8 297.6 297.6 297.4 297.4 297.3 297.2 297.2 297.1 296.9 297.0 296.9 296.7 296.9 296.7 296.7 296.9 296.9 297.1 297.2 297.2 297.2 297.2 297.2 297.4 297.6 297.6 297.6 297.4 297.4 297.6 297.6 297.6 297.6 297.6 297.4 297.6 297.4 297.2 297.6 297.4 297.6 297.6 297.6 297.4 297.6 297.6 297.6 297.6 297.6 297.6 297.8 297.8 297.8 297.8 297.6 297.4 297.4 297.2 297.1 297.1 297.1 296.9 296.9 296.9 296.8 296.9 296.9 296.9 297.1 297.2 297.2 297.4 297.8 297.8 298.1 298.5 298.6 298.8 299.1 299.4 299.7 299.8 300.0 300.0 299.7 299.3 298.8 298.3 298.1 298.1 298.1 298.1 298.1 298.1 298.1 298.1 298.1 298.1 298.1 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 298.0 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.9 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.8 297.6 297.8 297.8 297.4 298.1 297.8 297.4 297.6 297.6 297.4 297.6 297.4 297.2 297.4 297.4 297.4 297.6 297.4 297.8 297.8 297.8 298.1 298.1 298.1 298.3 298.0 297.2 296.0 293.8 290.8 288.6 285.1 284.0 286.7 290.4 294.1 297.9 301.7 305.6 309.5 313.4 317.4 321.5 325.6 329.8 334.0 338.3 342.6 347.0 351.4 355.9 360.5 365.1 369.8 368.3 367.9 368.5 369.9 370.6 371.9 371.9 373.0 373.2 374.1 374.7 375.4 375.6 375.8 375.6 376.5 376.6 377.8 378.1 379.1 379.7 381.0 382.4 384.2 385.7 387.5 388.6 390.4 391.2 392.7 393.4 393.7 394.3 395.4 395.6 396.3 396.7 396.8 396.8 396.8 396.8 396.8 396.8 396.8 396.8 397.0 397.0 396.8 397.0 397.0 397.0 397.0 397.0 396.8 396.8 396.6 396.6 396.6 396.3 396.3 396.6 396.4 396.3 396.5 396.6 396.3 396.7 396.6 396.6 396.8 396.6 396.6 396.8 396.6 396.3 396.3 396.3 396.1 396.1 396.1 396.1 396.1 396.3 396.3 396.6 396.8 396.8 397.0 397.2 397.5 397.5 397.9 398.2 398.0 398.6 398.6 398.6 399.5 399.5 399.5 400.7 400.7 400.5 401.0 400.7 400.3 400.0 399.6 399.2 398.9 398.5 398.2 397.8 397.4 397.1 396.7 396.4 396.0 395.6 395.3 394.9 394.6 394.2 393.8 393.5 
393.1 392.8 392.4 392.1 391.7 391.4 391.0 390.6 390.3 389.9 389.6 389.2 388.9 388.5 388.2 387.8 387.5 387.1 386.8 386.4 386.1 385.7 385.4 385.0 384.7 384.3 384.0 383.6 383.3 382.9 382.6 382.2 381.9 381.5 381.2 380.8 380.5 380.2 379.8 380.8 380.6 380.4 379.7 380.3 379.7 380.0 379.0 378.9 377.8 378.0 377.8 378.1 377.6 377.6 377.1 377.3 377.7 378.6 379.1 380.6 381.5 383.0 385.3 387.5 389.7 391.8 392.9 394.0 394.6 395.6 395.9 396.2 396.3 396.5 396.6 396.8 396.8 396.8 396.8 396.8 396.8 396.8 396.6 396.8 396.8 396.6 396.7 396.8 396.6 396.8 396.8 396.6 396.8 396.6 396.6 396.8 396.6 396.6 396.7 396.6 396.8 396.8 396.6 396.7 397.0 396.8 397.0 397.2 397.2 397.5 397.7 397.7 397.7 397.7 397.7 397.5 397.7 397.5 397.5 397.7 397.7 397.5 397.5 397.5 397.0 396.3 396.1 394.3 393.1 391.3 389.1 387.6 385.7 384.4 383.9 383.5 383.3 384.4 384.6 384.6 386.4 386.9 387.5 388.8 390.6 392.4 394.2 396.3 396.9 397.5 397.7 397.7 397.7 397.7 397.5 397.5 397.5 397.5 397.5 397.5 397.5 397.7 397.5 397.5 397.2 397.0 396.8 396.3 395.9 395.0 394.7 393.8 392.2 391.3 389.3 386.8 385.0 381.7 379.0 375.8 373.4 372.5 373.4 374.8 376.1 377.5 378.8 380.2 381.6 383.0 384.3 385.7 387.1 388.5 389.9 391.3 392.7 394.2 395.6 397.0 397.2 397.4 397.7 397.7 397.7 397.7 397.7 397.5 397.2 397.0 396.6 396.6 396.1 396.1 395.9 395.6 395.6 395.9 395.7 395.6 395.9 395.9 395.6 395.8 396.1 396.1 396.3 396.6 396.4 396.6 396.8 396.8 396.8 397.2 397.2 397.2 397.4 397.5 397.5 397.2 397.2 397.0 396.3 395.9 395.0 394.3 393.3 392.4 392.1 391.5 390.9 391.2 391.5 391.9 393.1 394.3 395.2 396.1 397.5 397.7 397.9 397.9 397.9 397.7 397.5 397.5 397.1 397.0 396.8 396.8 396.8 397.0 397.0 397.0 397.2 397.0 396.8 396.6 396.4 395.4 394.0 393.8 392.7 391.3 391.3 390.0 388.7 387.5 385.8 383.0 380.6 379.0 376.7 375.4 374.1 372.9 371.6 370.3 369.1 367.8 366.6 365.3 364.1 362.9 361.6 360.4 359.2 358.0 356.7 355.5 356.6 359.7 363.4 366.9 370.2 372.4 373.8 374.7 375.4 375.8 375.8 375.8 375.8 376.0 376.2 376.5 376.6 376.9 377.1 377.3 377.3 377.3 377.3 376.9 376.2 375.8 375.6 374.3 373.4 373.0 370.4 369.0 367.0 364.4 360.3 357.0 354.9 352.9 353.0 354.9 357.2 359.5 361.8 364.1 366.4 368.8 371.1 373.5 375.9 378.3 380.7 383.2 385.6 388.1 389.5 392.7 393.8 396.6 397.9 398.8 399.8 400.2 400.7 400.5 400.5 400.2 400.0 399.8 399.8 399.8 399.8 399.6 399.1 398.4 397.7 397.2 396.9 396.1 395.5 395.0 394.3 393.1 391.1 389.8 387.0 383.7 381.5 378.4 376.4 377.8 380.9 384.1 387.3 390.6 393.9 397.2 400.5 403.8 407.2 410.6 414.1 417.5 421.0 424.6 428.1 431.7 435.3 439.0 439.6 442.5 443.1 444.3 445.6 445.6 445.4 445.6 445.4 444.9 444.6 444.3 443.8 444.0 444.1 444.3 444.6 444.9 445.1 445.6 445.8 446.1 446.4 446.4 446.7 446.7 446.4 446.1 445.9 444.9 444.4 442.3 440.5 438.4 434.7 430.7 425.0 421.1 418.5 415.3 412.2 410.3 407.7 404.8 402.3 399.6 395.6 395.0 393.1 390.6 391.0 391.3 392.0 392.8 395.6 396.4 397.2 397.5 397.9 397.7 397.7 397.5 397.2 397.0 396.8 396.6 396.3 396.1 395.9 395.6 395.6 395.6 395.9 395.9 395.9 395.9 396.1 395.9 396.1 396.3 396.3 396.6 396.8 396.8 397.0 397.2 397.2 397.2 397.4 397.5 397.5 397.7 397.7 397.7 397.7 397.5 397.5 397.5 397.5 397.2 397.4 397.5 397.2 397.4 397.5 397.5 397.7 398.2 398.2 398.6 399.4 399.8 400.2 400.8 400.7 399.8 400.2 399.3 399.1 398.2 397.2 396.2 395.1 394.1 393.1 392.1 391.0 390.0 389.0 388.0 387.0 386.0 385.0 384.0 383.0 382.0 381.0 380.0 379.0 378.0 377.0 376.0 375.0 374.1 373.1 372.1 371.1 370.2 369.2 368.2 367.3 366.3 365.4 364.4 363.5 362.5 361.6 360.6 359.7 358.8 357.8 356.9 356.0 355.0 354.1 355.7 359.0 362.0 365.1 369.1 370.7 373.9 375.2 375.2 
375.8 376.2 375.4 375.6 375.4 374.5 373.8 373.2 372.1 372.0 372.1 372.3 372.8 373.9 374.0 375.2 375.9 376.0 376.0 376.0 374.7 374.1 373.2 370.0 367.7 363.2 359.7 354.5 350.8 348.8 349.7 351.3 353.0 354.6 356.3 357.9 359.6 361.3 363.0 364.7 366.4 368.1 369.8 371.5 371.7 371.9 371.9 371.5 371.3 371.1 370.9 370.9 371.3 371.7 372.2 372.8 373.0 373.2 373.6 373.6 373.9 374.0 374.1 374.3 374.5 374.3 373.8 373.0 372.6 371.9 371.7 371.9 371.7 371.7 372.2 372.1 372.0 372.1 371.9 370.9 369.1 367.4 364.3 361.8 359.2 356.2 353.9 352.1 350.8 353.5 356.2 358.8 361.5 364.3 367.0 369.8 372.6 375.4 378.2 378.8 379.1 378.4 377.3 377.3 377.2 376.7 376.7 376.9 376.5 376.1 376.2 375.6 375.6 375.8 375.4 375.4 375.6 375.4 375.4 375.6 375.4 375.8 375.8 375.4 375.4 374.7 374.0 371.3 367.7 363.4 357.2 353.4 349.4 348.4 348.1 347.8 347.4 347.1 346.8 346.4 346.1 345.8 345.4 345.1 344.8 344.5 344.1 343.8 343.5 343.1 342.8 342.5 342.2 341.8 342.0 341.3 340.9 339.9 339.1 338.3 337.1 336.4 335.8 335.4 335.1 335.2 335.0 335.0 335.0 335.0 334.8 334.6 334.8 334.6 334.3 334.2 333.7 333.1 332.7 332.2 331.3 331.5 331.5 331.0 331.0 330.4 329.8 328.5 325.3 322.6 317.3 313.4 309.2 306.8 309.0 311.1 313.3 315.5 317.7 319.9 322.2 324.4 326.7 329.0 331.2 333.6 335.9 337.3 337.5 337.2 336.9 336.6 336.6 336.2 335.8 335.4 334.9 334.6 334.4 334.1 334.2 334.2 334.0 334.2 334.4 334.2 334.2 334.4 334.0 333.9 333.8 333.4 333.5 333.3 332.8 332.5 332.3 331.7 331.7 331.6 331.0 331.1 330.8 330.4 330.4 328.9 327.4 324.7 321.2 317.9 314.0 311.2 308.3 306.7 309.2 311.8 314.4 317.0 319.7 322.4 325.1 327.8 330.5 333.3 333.7 334.2 334.7 335.0 335.0 335.5 335.6 335.4 335.6 335.6 335.4 335.4 335.4 335.4 335.2 335.0 334.8 334.5 334.2 333.8 333.7 333.1 332.3 332.3 331.3 331.8 331.9 332.1 332.3 331.7 331.2 330.2 327.4 324.7 320.4 316.9 312.4 310.1 310.1 310.6 311.1 311.6 312.1 312.5 313.0 313.5 314.0 314.5 315.0 315.5 316.0 316.5 317.0 317.5 318.0 320.5 323.8 327.0 330.0 332.3 334.4 335.6 335.9 336.6 336.6 336.2 336.4 336.0 335.8 335.6 335.3 334.8 334.8 334.8 334.4 334.6 334.6 334.5 335.0 334.6 334.0 332.7 329.2 324.5 318.0 312.9 306.7 303.6 301.2 303.1 304.9 306.8 308.7 310.6 312.5 314.5 316.4 318.3 320.3 322.3 324.3 326.3 328.3 330.3 332.3 334.4 336.4 338.5 339.7 339.1 338.6 337.7 337.1 336.4 336.2 335.8 335.2 335.4 335.0 334.8 334.8 334.6 334.4 334.6 334.4 334.6 334.8 334.6 334.8 335.0 334.8 334.8 334.8 334.8 334.8 334.6 334.4 334.1 333.7 333.3 332.0 330.6 328.4 324.0 319.7 315.4 310.1 305.4 302.8 300.7 299.8 302.6 306.4 310.2 314.1 318.0 321.9 325.9 330.0 334.1 338.3 340.1 339.6 338.5 336.9 336.3 335.2 334.9 334.4 334.0 333.9 333.8 333.7 333.7 333.7 333.8 333.8 334.0 334.2 334.2 334.2 334.4 334.2 334.2 334.2 333.7 333.5 333.0 330.4 328.3 325.6 319.9 315.6 309.2 304.4 300.4 296.6 296.9 297.3 297.6 298.0 298.3 298.7 299.0 299.4 299.7 300.1 300.5 300.8 301.2 301.5 301.9 302.3 302.6 303.0 301.5 301.6 301.0 300.4 300.0 299.4 298.6 298.8 298.3 297.9 298.1 297.8 297.6 297.6 297.4 297.4 297.6 297.4 297.4 297.6 297.4 297.4 297.4 297.2 297.2 297.4 297.6 298.0 298.6 299.0 299.0 299.1 298.3 296.4 294.2 291.2 286.8 284.3 281.7 284.1 286.6 289.1 291.6 294.1 296.6 299.2 301.8 304.4 307.0 309.7 312.4 315.1 317.8 320.5 323.3 326.1 328.9 331.8 334.6 337.5 337.9 338.9 339.1 338.5 338.3 338.1 337.9 337.9 337.1 336.6 335.9 335.6 335.3 335.0 334.6 334.6 334.2 334.2 334.2 334.2 334.2 334.0 333.8 333.1 331.9 331.1 328.5 326.8 324.9 322.3 320.4 319.0 320.7 322.5 324.4 326.2 328.0 329.9 331.7 333.6 335.5 337.3 339.2 341.1 343.1 345.0 346.9 348.9 350.8 357.2 363.1 371.9 374.3 
376.9 380.2 382.2 385.3 387.7 390.0 390.6 391.8 392.7 393.2 394.0 394.7 394.9 395.0 395.1 395.4 395.9 396.2 396.8 397.0 397.5 397.7 397.9 397.9 398.1 398.2 398.2 398.0 398.2 397.9 397.9 397.9 397.9 397.9 397.9 397.9 397.9 397.7 397.9 397.9 397.5 397.2 397.2 397.0 396.6 396.8 396.8 396.4 396.1 396.1 395.4 395.4 395.4 395.0 394.3 394.3 393.6 393.6 393.8 394.2 393.8 395.9 396.4 396.9 397.4 397.9 398.4 397.4 396.8 395.6 394.7 395.4 395.6 394.7 395.5 395.6 396.6 398.3 402.1 405.8 407.9 407.9 409.1 410.3 411.3 412.2 413.9 413.9 414.3 414.5 414.8 414.6 415.0 414.1 414.3 414.1 414.8 415.6 417.5 418.3 420.6 422.6 425.0 428.2 431.4 433.4 435.7 437.0 439.2 440.3 442.0 442.8 443.0 443.1 443.5 443.6 443.8 444.5 444.9 444.9 445.4 445.4 445.2 445.4 444.5 443.3 440.8 438.4 435.2 431.4 428.7 425.7 424.6 423.5 422.4 421.2 420.1 419.0 417.9 416.8 415.6 414.5 413.4 412.3 411.2 410.1 409.0 408.0 406.9 405.8 404.7 403.6 402.5 401.5 400.4 399.3 398.3 397.2 396.1 395.1 394.0 394.7 396.2 397.0 398.2 398.8 399.1 399.1 398.8 398.8 398.4 398.2 398.0 397.9 397.7 397.7 397.7 397.5 397.7 397.7 397.2 397.0 396.6 395.2 394.8 393.1 392.0 391.1 389.3 387.6 384.8 381.9 379.2 375.4 372.1 369.6 370.1 370.6 371.1 371.6 372.1 372.6 373.2 373.7 374.2 374.7 375.2 375.7 376.3 376.8 377.3 377.8 378.3 378.9 377.6 377.1 377.3 376.6 375.8 375.6 375.2 374.7 374.9 374.7 374.7 374.7 374.7 374.4 374.5 374.5 374.5 374.5 374.5 374.3 374.3 374.3 374.3 374.1 373.9 373.9 373.6 373.6 373.6 373.6 373.9 374.3 374.5 374.7 374.9 375.1 375.4 375.6 375.6 375.6 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 375.8 376.0 375.8 376.0 376.0 375.8 376.0 375.8 375.8 375.6 375.6 375.2 375.2 375.2 374.9 374.7 374.7 374.5 374.5 374.7 374.7 374.9 375.2 375.2 375.2 375.6 375.6 375.6 376.0 376.0 376.0 376.7 376.7 377.0 377.8 377.8 378.2 378.6 378.5 377.8 378.0 376.9 376.0 374.0 371.9 369.9 367.9 365.9 363.9 361.9 360.0 358.0 356.0 354.1 352.2 350.3 348.3 346.5 344.6 342.7 340.8 339.0 337.1 335.3 333.5 331.6 329.8 328.0 326.3 324.5 322.7 320.9 319.2 317.5 315.7 314.0 312.3 310.6 308.9 307.2 305.6 303.9 302.2 300.6 299.0 297.3 295.7 294.1 292.5 290.9 289.3 287.7 286.2 284.6 283.1 281.5 283.2 285.6 287.6 290.6 292.5 295.0 296.2 296.9 296.9 297.0 297.4 297.2 297.5 297.8 297.6 297.8 298.1 297.8 297.8 298.1 297.6 297.8 297.6 297.4 297.1 297.2 296.9 297.2 297.9 297.9 298.8 299.1 298.9 298.1 297.0 295.0 291.6 289.5 286.6 285.0 287.6 291.3 295.0 298.8 302.6 306.5 310.5 314.4 318.5 322.6 326.7 330.9 335.2 339.5 343.8 348.3 352.7 357.3 361.9 366.5 366.4 366.2 366.3 368.5 368.9 370.2 371.3 372.2 372.4 374.1 374.5 374.7 374.7 374.7 373.8 373.9 373.9 374.2 374.9 376.0 376.7 378.2 379.5 381.9 383.4 385.7 387.3 388.6 390.0 391.5 392.4 393.4 393.9 395.0 395.3 396.6 396.8 397.2 397.2 397.2 397.2 397.2 397.2 397.2 397.2 397.2 397.2 397.4 397.2 397.4 397.5 397.5 397.5 397.7 397.5 397.7 397.9 397.7 397.7 397.9 397.9 397.9 398.1 398.2 398.0 397.9 397.9 397.7 397.7 397.7 397.5 397.5 397.3 397.2 397.0 397.0 397.0 396.8 396.8 396.8 396.3 396.3 396.3 395.9 396.1 396.3 396.3 396.5 397.2 397.4 397.7 398.6 398.6 398.8 399.5 399.3 399.8 400.5 401.4 402.2 403.7 404.6 404.6 405.6 404.8 404.1 403.4 402.7 401.9 401.2 400.5 399.7 399.0 398.3 397.6 396.8 396.1 395.4 394.7 394.0 393.3 392.5 391.8 391.1 390.4 389.7 389.0 388.3 387.6 386.9 386.2 385.5 384.8 384.1 383.4 382.7 382.0 381.3 380.6 379.9 379.2 378.6 377.9 377.2 376.5 375.8 375.1 374.5 373.8 373.1 372.4 371.7 371.1 370.4 369.7 369.1 368.4 
367.7 367.1 366.4 365.7 365.1 364.4 364.3 365.5 367.4 368.5 368.7 369.8 369.6 368.9 368.5 368.3 367.2 366.9 366.2 365.3 364.1 364.1 363.4 363.8 364.9 365.6 366.0 367.9 368.5 370.4 372.6 374.9 376.4 379.5 380.8 382.1 384.8 386.4 387.5 389.7 390.5 392.2 393.2 395.2 395.9 396.6 396.8 397.0 396.8 396.8 397.0 396.8 396.8 397.0 396.8 396.8 397.0 396.8 397.0 397.2 397.2 397.5 397.7 397.7 397.9 398.1 398.2 398.2 398.2 398.2 398.0 397.7 397.7 397.0 396.3 395.9 395.2 394.6 394.3 393.8 393.2 393.6 393.0 392.7 393.1 393.3 392.7 393.4 393.4 392.7 393.1 392.9 391.8 390.4 389.3 387.0 384.4 382.3 379.5 377.6 379.0 380.5 381.9 383.4 384.9 386.4 387.9 389.4 390.9 392.4 393.9 395.4 396.9 398.5 400.0 400.2 400.5 399.9 399.1 399.5 399.7 399.3 399.3 399.5 398.8 398.7 398.8 398.5 398.4 398.6 398.4 398.2 398.3 398.2 398.2 398.3 398.2 398.2 398.2 397.9 397.7 397.7 397.5 397.0 396.8 396.6 395.9 394.5 392.7 389.3 385.4 379.1 374.9 368.5 364.5 367.2 370.0 372.8 375.6 378.4 381.2 384.1 387.0 389.9 392.8 395.7 398.7 401.7 402.3 401.6 401.4 400.0 399.2 398.8 398.4 398.0 397.9 398.1 397.9 397.9 398.1 398.4 398.4 398.6 398.6 398.4 398.6 398.6 398.4 398.6 398.6 398.4 398.4 398.4 398.2 398.3 398.2 398.0 398.2 397.9 397.9 398.2 398.0 397.9 397.7 397.3 397.2 397.0 396.3 396.6 396.2 395.4 395.9 396.1 395.9 397.0 397.7 397.9 398.8 399.3 399.5 399.5 399.7 399.5 399.5 399.5 399.3 399.3 399.1 398.8 398.7 398.4 398.2 397.7 397.5 397.5 397.2 397.0 397.2 397.0 397.2 397.2 397.2 397.2 397.5 397.3 397.2 397.2 396.6 395.9 395.5 393.6 390.8 387.3 382.6 378.6 373.4 371.1 370.8 371.0 371.2 371.5 371.7 371.9 372.1 372.3 372.5 372.7 373.0 373.2 373.4 373.6 373.8 374.0 374.3 374.5 374.7 374.9 375.1 375.3 375.5 375.6 375.8 375.8 375.4 375.9 375.4 375.2 375.6 375.2 375.2 375.4 374.8 374.7 374.7 374.5 374.5 374.7 374.5 374.7 374.7 374.7 374.9 374.9 374.7 374.0 372.4 370.7 366.6 363.0 358.7 353.9 350.2 349.0 351.4 353.9 356.3 358.8 361.3 363.8 366.3 368.8 371.4 373.9 376.5 379.1 381.7 384.4 387.0 389.7 392.4 395.1 397.8 399.1 399.3 399.1 398.8 398.6 398.8 398.6 398.6 399.1 398.9 398.6 398.8 398.7 398.2 398.3 398.2 397.9 397.9 397.7 397.5 397.5 397.5 397.3 397.2 397.2 396.8 397.0 397.0 396.8 397.0 397.0 397.0 397.2 397.2 397.0 397.2 397.0 396.6 396.3 395.4 394.0 392.2 393.2 394.7 395.1 395.6 396.8 397.7 398.8 401.0 404.4 407.9 411.7 417.7 424.0 426.2 428.7 431.4 434.7 436.2 438.7 440.3 440.9 441.5 442.2 442.8 443.3 443.5 444.1 444.3 444.3 444.9 445.1 444.9 445.1 445.1 444.3 444.6 444.1 443.3 442.0 440.8 438.6 434.7 432.4 429.0 426.0 425.0 423.9 422.8 421.7 420.7 419.6 418.5 417.5 416.4 415.4 414.3 413.2 412.2 411.1 410.1 409.1 408.0 407.0 405.9 404.9 403.9 402.9 401.8 400.7 400.0 399.6 399.5 399.7 399.3 399.5 399.3 398.4 398.4 397.9 397.2 397.4 397.0 396.8 396.8 396.8 396.4 396.8 396.6 396.6 396.8 396.6 396.6 396.7 396.8 396.6 396.7 396.8 396.6 396.8 396.8 396.6 396.8 396.6 396.3 396.6 396.6 396.3 396.5 397.0 397.0 397.0 397.2 396.8 396.3 396.3 395.6 395.2 393.8 393.4 393.1 392.9 392.7 392.4 392.2 391.9 391.7 391.5 391.2 391.0 390.7 390.5 390.2 390.0 389.8 389.5 389.3 389.0 388.8 388.6 388.3 388.1 387.8 387.6 387.4 387.1 386.9 386.6 386.4 386.2 385.9 385.7 385.5 385.2 385.0 384.7 384.5 384.3 384.0 383.8 383.6 383.3 383.1 382.8 382.6 382.4 382.1 381.9 381.7 381.4 381.2 381.0 380.7 380.5 380.3 380.0 379.8 379.3 378.8 378.0 378.2 378.2 378.0 377.8 377.8 377.8 377.2 376.9 376.7 376.0 376.0 375.8 375.6 375.6 375.4 375.4 375.4 375.4 375.4 375.4 375.2 375.2 375.2 375.0 374.3 373.0 371.6 367.2 362.8 355.5 350.4 346.4 343.2 344.7 346.1 347.6 349.1 350.5 
352.0 353.5 355.0 356.5 358.0 359.5 361.0 362.6 364.1 365.6 367.2 368.7 370.3 371.9 373.4 374.7 375.2 375.4 374.8 375.2 375.8 375.4 375.2 375.5 374.9 374.8 375.2 374.5 374.5 374.7 374.5 374.3 374.5 374.3 374.3 374.3 373.9 373.6 372.7 371.3 369.1 364.7 360.3 355.1 347.4 341.1 336.4 331.7 329.2 330.4 332.6 334.8 337.1 339.3 341.6 343.9 346.2 348.5 350.8 353.1 355.5 357.8 360.2 362.6 365.0 367.5 369.9 372.4 374.9 375.4 375.2 375.2 374.7 375.1 375.6 375.6 376.0 376.2 376.1 376.2 376.4 376.0 376.2 376.2 376.0 376.2 376.2 376.0 376.2 376.0 375.8 376.0 375.4 374.5 373.0 368.7 364.1 357.2 351.6 343.8 339.9 334.2 332.1 332.3 332.5 332.7 333.0 333.2 333.4 333.6 333.8 334.0 334.2 334.4 334.7 334.9 335.1 335.3 335.5 335.7 335.9 336.1 336.4 336.8 337.1 337.0 336.6 336.6 336.6 336.6 336.4 336.6 336.2 336.0 336.0 335.6 335.6 335.6 335.6 335.2 335.4 335.4 335.2 335.2 335.4 335.0 335.0 334.8 334.8 334.6 334.5 334.4 334.4 334.3 334.4 334.6 334.8 335.0 335.5 335.4 335.4 335.6 334.6 333.8 333.1 331.8 329.8 329.4 328.8 328.1 328.7 329.4 329.8 330.8 331.9 332.7 333.8 334.6 335.1 335.6 335.7 336.2 336.4 336.2 336.4 336.2 336.0 335.8 335.5 335.2 335.2 335.0 335.0 334.8 334.8 334.6 334.8 334.8 334.8 335.0 335.0 334.8 334.6 334.1 333.7 331.9 328.7 324.3 320.8 317.1 315.7 316.4 317.5 318.6 319.8 320.9 322.0 323.2 324.3 325.5 326.7 327.8 329.0 330.2 331.3 332.5 333.7 334.9 336.1 337.3 338.5 339.7 340.1 339.7 340.1 339.6 339.5 339.3 338.8 337.9 337.3 336.4 336.0 335.5 335.6 335.8 335.8 335.8 336.1 335.8 335.8 336.0 336.0 335.8 336.0 335.8 336.2 336.2 336.2 336.2 336.2 336.0 335.8 335.6 335.0 334.8 332.7 331.0 326.8 320.8 314.7 310.4 306.5 305.0 306.8 309.5 312.2 314.9 317.6 320.3 323.1 325.9 328.7 331.6 334.5 337.3 340.3 339.3 338.6 337.9 337.1 336.4 335.8 335.6 335.2 335.2 335.0 335.0 335.0 334.8 334.6 335.0 334.6 334.8 334.8 334.5 334.4 334.8 334.5 334.4 334.6 334.0 333.8 333.8 332.3 331.3 327.5 322.8 318.4 312.2 308.6 309.1 309.6 310.1 310.5 311.0 311.5 312.0 312.4 312.9 313.4 313.9 314.4 314.9 315.3 315.8 316.3 316.8 317.3 317.8 318.3 318.8 320.8 323.3 326.2 329.4 331.7 333.1 334.8 335.8 336.1 336.6 336.4 336.2 336.2 336.0 335.8 335.4 335.2 335.0 334.8 334.8 334.8 334.8 334.6 334.5 333.8 333.3 330.9 327.5 322.1 317.1 310.9 307.2 302.6 304.0 305.3 306.7 308.1 309.4 310.8 312.2 313.6 315.0 316.4 317.8 319.2 320.7 322.1 323.5 325.0 326.4 327.9 329.4 330.8 332.3 333.7 335.4 336.5 338.3 339.0 339.5 339.5 339.0 338.5 337.9 337.1 336.6 336.2 336.0 335.8 335.6 335.4 335.4 335.0 335.0 334.8 334.4 334.1 333.7 332.9 332.1 331.9 331.1 331.0 331.0 330.8 330.4 330.7 330.6 330.4 330.4 330.0 329.7 329.4 329.0 328.7 325.5 322.8 319.5 317.5 314.4 312.0 310.5 307.9 305.6 304.0 302.1 300.6 298.3 299.3 299.7 299.8 299.5 299.0 298.5 298.8 298.5 298.5 298.5 298.3 297.9 298.1 298.0 297.6 297.7 297.6 297.6 297.7 297.6 298.1 298.1 298.3 298.4 298.5 298.6 298.5 298.5 298.2 295.9 294.7 291.0 286.8 282.4 277.5 274.2 271.5 270.1 273.9 277.8 281.8 285.8 289.8 294.0 298.2 302.4 306.7 311.1 315.5 320.0 324.5 329.2 333.8 336.4 337.7 338.0 337.5 337.1 337.0 336.7 336.4 336.0 336.0 335.5 335.0 335.0 335.0 334.8 335.1 335.4 335.4 335.4 335.2 335.2 335.0 334.7 334.8 334.8 334.6 334.8 334.8 334.6 334.8 334.6 334.2 334.4 334.4 333.8 334.0 333.3 332.3 329.8 327.4 324.4 321.2 319.5 318.2 321.7 325.2 328.8 332.3 336.0 339.6 343.4 347.1 350.9 354.7 358.6 362.5 366.5 370.5 372.1 371.9 372.8 373.9 374.9 374.9 376.5 376.1 375.8 375.6 375.6 374.9 375.7 375.4 375.6 376.5 377.8 378.3 381.1 381.6 383.0 384.8 386.5 388.2 390.2 392.0 393.1 393.7 395.0 395.0 395.6 
395.9 395.7 395.4 395.6 395.4 395.4 395.4 395.2 395.2 395.2 395.0 395.0 394.7 394.2 394.0 393.1 391.2 389.5 386.5 381.1 376.9 372.6 366.4 362.7 359.0 356.5 353.9 351.3 348.8 346.3 343.8 341.3 338.9 336.4 334.0 331.6 329.2 326.8 324.5 322.1 319.8 317.5 315.2 313.0 310.7 308.5 306.2 304.0 301.8 299.7 299.3 299.2 299.1 298.5 298.1 298.3 298.0 297.6 297.6 297.1 296.9 296.6 296.4 296.1 296.2 296.0 295.9 296.0 296.0 295.9 296.2 296.1 296.0 296.2 296.2 296.0 296.3 296.4 296.4 296.6 296.6 296.6 296.7 296.7 296.7 296.7 296.9 296.9 297.0 297.1 297.1 297.2 297.4 297.3 297.4 297.2 297.2 297.2 297.2 297.2 297.1 296.9 296.9 296.7 296.2 295.9 294.3 292.3 289.2 284.5 279.8 276.8 273.4 271.8 272.8 274.5 276.2 277.9 279.6 281.3 283.0 284.8 286.5 288.3 290.0 291.8 293.6 295.4 297.2 299.0 300.9 301.0 300.9 300.6 300.5 300.2 300.0 299.7 299.5 299.0 299.0 298.7 298.5 298.3 298.1 297.9 297.9 298.1 298.0 298.1 298.1 298.1 298.1 298.5 297.9 297.9 297.9 297.1 296.4 293.8 290.8 287.7 284.0 281.9 281.1 282.2 283.3 284.4 285.6 286.7 287.8 289.0 290.1 291.3 292.5 293.6 294.8 296.0 297.2 298.3 299.5 300.7 301.9 303.1 304.3 305.6 306.8 306.5 306.6 305.8 305.4 304.4 303.1 302.3 301.2 300.2 299.3 298.8 298.2 297.6 297.6 297.2 296.9 296.9 296.6 296.6 296.6 296.4 296.5 296.6 296.6 296.7 296.7 296.6 296.7 296.9 296.8 296.7 296.9 296.9 296.9 297.3 297.2 297.4 297.6 297.6 297.6 297.9 297.8 297.8 297.6 297.6 297.6 297.6 297.4 297.4 297.3 297.1 296.9 296.6 296.4 296.2 295.9 295.7 295.9 295.5 295.4 295.7 295.6 295.7 296.2 296.4 296.6 297.2 297.4 297.8 298.5 299.0 299.1 299.7 299.3 298.5 297.2 296.2 294.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7 293.7",
"input_type": "phoneme",
"offset": 17.946
},
{
"text": "SP 体 温 SP 伴 着 喘 息 有 点 上 升 SP 窗 外 吹 来 了 夏 天 的 风 SP",
"ph_seq": "SP t i w en SP b an zh e ch uan x i y ou d ian sh ang sh eng SP ch uang w ai ch ui l ai l e x ia t ian d e f eng SP",
"note_seq": "rest D4 D4 G4 G4 rest G4 G4 G4 G4 G4 G4 G4 G4 F#4 F#4 G4 G4 A4 A4 G4 G4 rest F#4 F#4 F#4 F#4 E4 E4 E4 E4 E4 E4 D#4 D#4 D#4 D#4 D#4 D#4 E4 E4 rest",
"note_dur_seq": "0.327 0.272 0.272 0.546 0.546 0.273 0.5450001 0.5450001 0.273 0.273 0.2719998 0.2719998 0.273 0.273 0.273 0.273 0.273 0.273 0.2720001 0.2720001 0.546 0.546 0.2730002 0.2719998 0.2719998 0.2729998 0.2729998 0.4090004 0.4090004 0.4089999 0.4089999 0.2729998 0.2729998 0.4089999 0.4089999 0.4090004 0.4090004 0.2729998 0.2729998 1.091 1.091 0.081",
"is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
"ph_dur": "0.179272 0.147728 0.168591 0.103409 0.546 0.224138 0.048862 0.47909 0.06591 0.14232 0.13068 0.119729 0.152271 0.202546 0.070454 0.218454 0.054546 0.136636 0.136364 0.119729 0.152271 0.546 0.158227 0.114773 0.184498 0.087502 0.103681 0.169319 0.327183 0.081818 0.34309 0.06591 0.109365 0.163635 0.27832 0.13068 0.321499 0.087502 0.087773 0.185226 1.091 0.081",
"f0_timestep": "0.005",
"f0_seq": "294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.3 294.5 295.2 295.3 295.8 295.8 296.0 295.7 295.7 295.9 295.9 295.9 295.9 295.9 296.0 295.9 295.9 296.0 295.9 295.9 296.4 296.2 296.4 296.9 296.7 296.7 297.0 296.7 296.6 296.8 296.5 296.4 296.4 296.1 295.7 295.3 297.7 300.0 301.5 302.6 304.9 306.3 307.4 310.6 312.4 315.5 319.3 322.7 327.9 332.5 338.3 344.3 349.6 358.1 364.5 366.4 367.8 369.6 370.6 372.4 373.3 373.6 373.4 373.9 373.2 374.6 374.8 375.2 374.5 374.7 373.9 374.9 374.9 376.4 376.2 377.3 377.7 379.3 380.9 383.0 384.1 385.6 386.2 387.9 388.6 390.5 391.1 392.0 392.6 393.4 394.0 394.8 395.0 395.0 395.0 395.1 395.0 395.0 395.2 395.2 395.2 395.2 395.2 395.2 395.4 395.2 395.2 395.4 395.2 395.2 395.4 395.2 395.2 395.4 395.2 395.2 395.4 395.2 395.2 395.4 395.4 395.4 395.6 395.4 395.6 395.8 395.6 395.6 395.6 395.6 395.6 395.6 395.6 395.6 395.6 395.7 395.9 395.9 395.9 395.9 395.9 396.1 395.9 395.9 396.1 396.1 396.2 396.6 396.6 396.8 397.5 397.7 397.9 398.6 398.5 398.3 399.0 398.8 398.3 397.9 397.5 397.1 396.8 396.4 396.0 395.6 395.2 394.8 394.5 394.1 393.7 393.3 392.9 392.5 392.2 391.8 391.4 391.0 390.6 390.3 389.9 389.5 389.1 388.7 388.4 388.0 387.6 387.2 386.9 386.5 386.1 385.7 385.4 385.0 384.6 384.2 383.9 383.5 383.1 382.7 382.4 382.0 381.6 381.3 380.9 380.5 380.1 379.8 379.4 379.0 378.7 378.3 377.9 377.6 377.2 376.8 376.5 376.1 375.6 374.7 376.1 376.3 376.5 375.0 375.2 374.5 374.0 372.4 372.3 371.2 371.5 371.8 372.1 371.2 371.7 370.7 371.2 372.1 372.8 373.0 375.0 375.4 376.9 379.8 381.5 383.7 386.4 387.5 389.6 391.3 393.0 393.8 394.7 395.1 395.9 396.2 396.8 397.0 397.0 397.2 397.0 397.0 397.0 397.0 396.8 397.0 396.8 396.8 396.7 396.6 396.6 396.5 396.3 396.6 396.6 396.6 396.6 396.8 396.8 397.0 397.1 397.2 397.2 397.0 396.8 396.5 396.2 395.9 395.3 394.7 393.5 393.2 392.2 391.8 392.0 391.7 391.4 391.8 391.7 391.5 392.1 391.6 390.9 391.1 390.1 388.8 387.3 385.5 381.7 379.3 376.5 375.2 376.5 377.8 379.1 380.4 381.7 383.0 384.3 385.6 386.9 388.3 389.6 390.9 392.3 393.6 395.0 396.3 396.6 396.6 396.6 396.6 396.8 397.4 397.2 397.1 397.2 396.8 396.5 396.3 395.9 395.9 395.9 395.6 395.6 395.6 395.7 395.9 395.6 395.6 395.7 395.2 395.2 394.3 392.1 390.0 385.8 381.9 375.7 372.1 366.9 365.1 366.2 367.2 368.3 369.4 370.5 371.6 372.6 373.7 374.8 375.9 377.0 378.1 379.2 380.3 381.4 382.5 383.7 384.8 385.9 387.0 388.2 390.4 391.3 393.9 395.3 396.1 397.0 397.2 397.7 397.7 397.7 397.4 397.2 397.4 397.2 397.0 397.0 397.0 396.7 396.6 396.8 396.4 396.6 396.5 396.3 396.1 395.4 394.5 392.1 389.5 387.0 383.5 382.4 381.0 381.6 382.3 382.9 383.6 384.3 384.9 385.6 386.2 386.9 387.6 388.2 388.9 389.6 390.2 390.9 391.6 392.3 392.9 393.6 394.3 395.0 395.6 395.4 395.4 395.7 395.8 395.4 395.4 395.6 395.0 394.5 394.5 393.8 393.6 393.6 393.7 393.4 393.6 393.6 393.7 394.1 394.3 394.1 394.3 394.5 394.5 394.5 394.6 394.0 394.2 393.9 393.4 393.3 393.1 392.2 392.0 391.8 390.5 390.0 389.5 388.3 386.8 385.2 383.8 383.2 382.7 381.9 380.7 380.2 378.6 377.4 376.5 375.2 374.3 374.6 375.0 375.2 374.7 374.7 374.4 374.1 374.1 373.8 373.6 373.5 373.6 373.4 373.4 373.4 373.4 373.3 373.4 373.3 373.4 373.7 373.9 373.9 374.1 374.1 373.9 373.9 373.9 373.6 373.6 373.1 372.6 371.9 370.4 368.3 
363.6 359.2 355.5 350.9 349.3 351.6 353.9 356.2 358.6 360.9 363.3 365.7 368.1 370.5 373.0 375.4 377.9 380.4 382.9 385.4 387.9 390.0 390.2 391.2 391.5 392.5 392.8 393.6 394.4 395.3 396.0 396.6 397.0 397.2 397.2 397.7 397.5 397.5 397.9 397.5 397.5 397.6 397.2 397.1 397.1 396.5 396.3 396.1 395.0 394.3 392.0 389.4 386.6 381.8 379.4 377.7 377.2 380.6 384.1 387.7 391.3 394.9 398.5 402.2 405.9 409.6 413.4 417.2 421.1 425.0 428.9 432.8 436.8 440.9 444.9 448.2 448.1 447.7 446.6 446.0 445.5 444.7 444.1 443.7 443.3 443.4 443.6 443.8 444.1 444.3 444.6 444.6 444.9 445.1 444.9 445.1 444.9 444.9 445.0 444.6 444.0 443.6 442.3 440.4 436.9 432.7 427.8 423.5 419.8 418.2 417.4 416.7 415.9 415.1 414.4 413.6 412.9 412.1 411.4 410.6 409.9 409.1 408.4 407.6 406.9 406.1 405.4 404.7 403.9 403.2 402.4 401.6 400.5 400.0 399.8 399.0 398.6 398.6 398.0 397.5 397.6 397.2 397.0 397.2 397.0 396.8 397.0 396.8 396.8 396.8 396.8 396.7 396.6 396.5 396.3 396.1 396.1 396.1 396.1 396.1 396.1 396.1 396.3 396.4 396.6 396.6 396.8 396.8 397.0 397.0 397.0 397.0 397.0 397.0 397.0 397.0 396.8 396.8 397.0 397.0 397.0 397.0 397.0 397.0 397.0 397.0 397.0 397.2 397.2 397.2 397.3 397.4 397.2 397.5 397.4 397.2 397.0 397.0 396.7 396.6 396.5 396.3 396.1 396.3 396.0 395.9 395.9 395.9 395.7 395.9 396.0 395.7 396.1 396.3 396.1 396.4 396.7 396.6 397.1 397.5 397.6 398.0 398.6 398.5 399.1 399.9 400.4 401.2 402.2 402.6 402.6 402.7 402.6 401.5 400.9 400.2 399.6 399.0 398.4 397.7 397.1 396.5 395.9 395.3 394.7 394.0 393.4 392.8 392.2 391.6 391.0 390.4 389.8 389.1 388.5 387.9 387.3 386.7 386.1 385.5 384.9 384.3 383.7 383.1 382.5 381.9 381.3 380.7 380.1 379.5 378.9 378.4 377.8 377.2 376.6 376.0 375.4 374.8 374.2 373.7 373.1 372.5 371.9 371.3 370.7 370.2 369.6 369.0 368.4 367.9 367.3 366.7 366.1 365.6 365.4 367.0 369.8 371.1 373.6 374.3 375.0 375.4 375.4 375.6 375.5 374.9 374.7 374.5 373.6 373.4 372.7 372.3 372.1 372.1 371.7 372.3 372.2 372.4 372.7 373.4 373.4 374.1 374.1 374.5 374.7 375.1 374.7 374.7 374.3 374.0 373.8 373.5 372.8 372.0 371.3 370.1 369.7 369.6 369.5 370.0 371.0 371.9 372.8 373.8 374.7 375.6 376.6 377.1 377.7 378.2 378.1 377.8 377.8 376.8 376.4 376.2 375.7 375.4 375.3 375.2 375.2 375.3 375.2 375.2 375.2 374.9 374.9 374.7 374.5 374.5 374.3 374.0 373.9 373.0 371.7 368.2 364.6 360.1 353.7 348.1 344.3 341.0 339.5 338.1 336.6 335.2 333.7 332.3 330.9 329.5 328.1 326.7 325.3 323.9 322.5 321.1 319.8 318.4 317.0 315.7 314.3 313.0 311.7 311.2 314.3 317.2 319.2 319.7 320.6 321.2 320.6 320.6 320.8 319.8 319.5 319.5 318.9 318.6 318.8 318.4 318.6 319.4 319.9 320.9 322.3 323.4 324.7 326.2 328.0 329.3 330.5 331.2 331.5 332.0 332.3 332.3 332.5 332.7 332.9 333.1 333.4 333.3 333.3 333.0 332.9 332.9 332.9 332.9 333.1 333.1 333.5 333.5 333.7 333.7 333.8 333.8 333.7 333.7 333.6 333.5 333.6 333.4 333.3 333.2 333.1 333.1 333.0 332.9 332.7 332.6 332.2 331.5 331.5 330.8 330.0 329.9 329.1 328.3 328.5 328.4 328.3 329.1 330.0 331.1 331.9 334.0 334.1 334.4 334.8 334.6 334.3 334.6 334.6 334.3 334.6 334.3 334.1 334.2 334.2 334.0 334.2 334.2 334.0 334.4 334.4 334.3 334.4 334.2 334.1 334.2 334.2 334.3 334.4 334.4 334.4 334.8 334.6 334.6 334.6 334.5 334.3 334.8 334.8 334.6 334.8 334.9 334.6 334.6 334.7 334.4 334.2 334.2 334.0 333.6 333.5 333.4 333.0 332.9 333.0 332.9 332.9 333.1 333.1 333.4 333.8 333.7 333.7 333.7 332.9 332.3 331.9 331.0 330.2 329.9 329.1 328.3 328.5 328.3 328.0 328.7 329.4 330.2 331.0 332.6 333.3 334.1 335.0 335.2 335.6 335.5 335.4 335.4 335.2 335.2 335.0 334.8 334.6 334.4 334.2 334.2 334.0 334.0 333.8 333.8 333.8 334.0 334.1 334.2 334.2 334.0 334.0 333.7 
333.3 332.5 331.0 328.5 325.5 321.7 316.7 313.1 310.0 310.8 311.5 312.2 313.0 313.7 314.4 315.2 315.9 316.7 317.4 318.2 318.9 319.7 320.4 321.2 322.0 322.7 323.5 324.2 325.0 325.8 326.5 327.1 326.8 325.4 325.2 324.5 322.7 321.7 320.5 319.4 318.6 317.7 316.9 316.5 316.4 316.2 316.2 316.0 315.9 316.0 315.8 316.1 316.2 316.0 316.1 316.4 316.0 316.2 316.5 316.2 316.4 316.7 316.6 316.6 316.7 316.6 316.6 316.6 316.2 316.2 316.0 315.8 315.5 315.2 314.5 312.4 310.3 306.1 301.6 297.4 294.2 290.9 288.8 289.7 290.5 291.4 292.3 293.1 294.0 294.9 295.8 296.7 297.5 298.4 299.3 300.2 301.1 302.0 302.9 303.8 304.7 305.7 306.6 307.5 308.4 309.3 310.3 311.2 312.1 313.1 314.0 315.0 316.3 317.1 317.8 318.4 318.7 318.6 318.8 318.9 318.4 318.6 318.8 318.2 318.2 318.4 317.8 317.6 317.2 316.7 316.6 316.3 316.0 315.9 315.6 315.5 315.3 315.5 315.9 316.2 316.2 316.4 316.6 316.5 316.7 316.7 316.6 316.7 316.5 316.2 316.0 315.1 314.1 313.2 312.0 311.1 311.1 310.8 311.0 311.4 311.7 311.9 312.1 312.2 312.2 312.2 312.4 312.2 312.1 312.2 312.0 311.9 312.0 311.5 310.9 310.8 309.3 307.5 305.4 303.0 301.6 299.5 298.5 299.7 302.7 305.7 308.7 311.7 314.8 317.7 319.5 320.4 320.1 319.6 319.0 318.4 317.8 317.3 316.7 316.5 316.4 316.1 316.2 316.0 316.0 316.0 316.0 315.8 316.2 316.0 316.0 316.0 316.0 315.8 315.8 315.8 314.9 314.6 312.9 310.2 308.1 303.6 299.8 295.0 292.4 290.5 291.9 293.3 294.7 296.1 297.5 299.0 300.4 301.8 303.3 304.8 306.2 307.7 309.2 310.7 312.2 313.7 315.2 316.7 318.2 319.8 321.3 322.8 322.7 321.8 321.0 321.1 320.1 319.3 318.4 318.4 317.1 317.3 317.7 317.6 317.1 317.4 316.6 316.8 316.9 316.9 317.0 318.0 317.6 318.4 319.3 320.1 320.2 322.4 322.9 324.0 325.9 327.1 328.1 329.9 330.4 331.5 332.3 333.3 333.6 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.1 334.2 334.2 334.0 334.2 334.2 334.0 334.2 334.2 334.1 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.1 334.2 334.2 334.0 334.2 334.2 334.1 334.2 334.2 334.1 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.2 334.0 334.2 334.0 334.1 334.2 334.0 334.1 334.2 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.1 334.2 334.0 334.1 334.2 334.0 334.0 334.2 334.0 334.0 334.2 334.0 334.1 334.2 334.0 334.1 334.2 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 334.0 333.8 333.8 333.8 333.7 333.7 333.7 333.6 333.5 333.5 333.5 333.3 333.5 333.5 333.5 333.5 333.7 333.6 333.5 333.7 333.7 333.7 333.7 333.7 333.8 333.7 333.8 333.8 333.7 333.7 333.8 333.7 333.7 333.8 333.7 333.7 333.7 333.7 333.5 333.8 333.5 333.5 333.7 333.6 333.5 333.7 333.7 333.7 333.8 333.8 333.7 334.0 334.0 334.1 334.5 334.8 335.1 335.7 336.2 336.7 337.1 337.4 337.5 336.6 336.1 335.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6 334.6",
"input_type": "phoneme",
"offset": 43.037
},
{
"text": "SP 夕 阳 晚 照 轻 眺 余 晖 风 在 背 后 温 柔 的 吹 SP 看 得 见 的 青 春 体 会 SP 看 不 清 我 究 竟 是 谁 SP 风 高 夜 黑 星 目 剑 眉 从 来 不 会 空 手 而 归 无 声 的 鬼 魅 SP",
"ph_seq": "SP x i y ang w an zh ao q ing t iao y v h ui f eng z ai b ei h ou w en r ou d e ch ui SP k an d e j ian d e q ing ch un t i h ui SP k an b u q ing w o j iu j ing sh ir sh ui SP f eng g ao y E h ei x ing m u j ian m ei c ong l ai b u h ui k ong sh ou er g ui w u sh eng d e g ui m ei SP",
"note_seq": "rest G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 rest F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 F#4 rest G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 A4 A4 A4 A4 G4 G4 rest G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 G4 F#4 F#4 F#4 F#4 E4 E4 E4 E4 D#4 D#4 rest",
"note_dur_seq": "0.6 0.272 0.272 0.273 0.273 0.273 0.273 0.273 0.273 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2719998 0.2719998 0.273 0.273 0.273 0.273 0.273 0.273 0.2720001 0.2720001 0.273 0.273 0.2730002 0.2730002 0.2729998 0.2729998 0.1360002 0.1360002 0.1359997 0.2730002 0.2730002 0.2729998 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.2730002 0.2730002 0.2729998 0.2729998 0.2720003 0.2720003 0.1369996 0.1369996 0.1360002 0.2730002 0.2730002 0.2729998 0.2729998 0.2719998 0.2719998 0.2730002 0.2730002 0.2729998 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.1370001 0.1370001 0.1359997 0.2730007 0.2730007 0.2719994 0.2719994 0.2730007 0.2730007 0.2729998 0.2729998 0.2729998 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.1359997 0.1359997 0.2720003 0.2720003 0.2729998 0.2729998 0.2729998 0.2729998 0.2730007 0.2730007 0.2719994 0.2719994 0.2730007 0.2730007 0.2729998 0.2729998 0.2729998 0.8179998 0.8179998 0.2720003 0.2720003 0.8190002 0.8190002 0.2719994 0.2719994 1.091 1.091 0.218",
"is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0",
"ph_dur": "0.436365 0.163635 0.217454 0.054546 0.175271 0.097729 0.20709 0.06591 0.130956 0.142044 0.184499 0.087502 0.20709 0.06591 0.185498 0.087502 0.151547 0.120453 0.218454 0.054546 0.234365 0.038635 0.185498 0.087502 0.20609 0.06591 0.163907 0.109093 0.212774 0.060226 0.087773 0.185226 0.136 0.011 0.125 0.218454 0.054546 0.163907 0.109093 0.217454 0.054546 0.093453 0.179546 0.148 0.125 0.202546 0.070454 0.184499 0.087502 0.137 0.065547 0.070454 0.240045 0.032955 0.148 0.125 0.201546 0.070454 0.185499 0.087502 0.191182 0.081818 0.135636 0.136364 0.103681 0.169319 0.137 0.00532 0.13068 0.234365 0.038635 0.217453 0.054546 0.185499 0.087502 0.130956 0.142044 0.202542 0.070457 0.162908 0.109093 0.202542 0.070457 0.115048 0.157951 0.20609 0.06591 0.245725 0.027275 0.148 0.125 0.229818 0.043182 0.129955 0.142044 0.212774 0.235501 0.097725 0.185498 0.087502 0.594138 0.223862 0.178815 0.093185 0.709908 0.109093 0.233364 0.038635 1.091 0.218",
"f0_timestep": "0.005",
"f0_seq": "364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.7 364.5 364.3 365.2 366.3 367.1 368.3 369.0 369.8 370.8 371.3 372.2 373.5 374.5 376.1 377.4 378.9 381.2 383.4 385.1 386.9 389.0 390.6 392.3 393.8 394.6 395.3 395.8 396.0 396.5 396.7 396.4 396.6 396.6 396.2 396.3 396.6 396.3 396.2 396.3 396.1 396.0 396.1 395.7 395.5 395.6 395.5 395.6 396.0 396.4 396.8 397.2 397.6 397.7 397.7 397.7 397.6 397.4 397.0 396.2 395.7 395.5 395.0 394.5 394.4 394.2 394.0 394.0 393.9 393.9 394.0 394.2 394.3 394.3 394.4 394.5 394.6 394.9 395.1 395.2 395.4 395.6 395.6 395.5 395.3 394.8 393.9 392.7 391.3 389.6 388.2 386.9 385.7 384.6 383.9 383.7 383.9 384.7 386.3 388.6 390.5 392.6 394.8 396.3 397.3 398.2 398.7 398.8 399.0 398.9 398.5 398.2 397.9 397.4 396.7 396.7 396.6 396.3 396.5 396.6 396.7 396.9 397.0 397.1 397.1 396.9 396.6 395.9 395.4 394.6 393.7 393.0 392.4 391.9 391.9 392.0 391.8 391.7 391.5 390.4 389.2 387.7 385.1 382.5 380.2 378.2 377.6 378.9 380.2 381.5 382.8 384.2 385.5 386.8 388.1 389.5 390.8 392.1 393.5 394.8 396.2 397.0 397.0 397.1 397.1 397.0 396.8 396.3 395.8 395.5 395.3 395.1 394.8 394.7 394.6 394.5 394.6 394.9 395.1 395.5 395.9 396.1 396.4 396.9 397.0 397.2 397.5 397.4 397.1 396.9 396.4 395.2 393.4 390.9 387.0 382.7 377.8 373.0 369.0 368.3 369.8 371.2 372.7 374.2 375.7 377.2 378.7 380.2 381.7 383.2 384.7 386.2 387.8 389.3 390.9 392.4 394.0 395.5 397.1 398.7 399.1 399.1 399.3 399.3 399.6 399.5 399.1 398.6 398.1 397.5 397.0 396.6 396.3 396.2 396.4 396.6 396.6 396.6 396.6 396.6 396.3 396.1 396.1 395.7 394.9 394.5 393.8 392.4 391.3 389.9 388.1 385.6 382.5 379.2 375.8 373.3 371.3 371.1 372.4 373.6 374.8 376.0 377.2 378.4 379.7 380.9 382.1 383.4 384.6 385.9 387.1 388.4 389.7 391.4 393.8 395.1 396.1 396.9 396.9 396.8 396.7 396.3 396.0 395.6 395.1 394.7 394.4 394.3 394.3 394.3 394.5 394.8 395.0 395.2 395.5 395.7 396.1 396.4 396.6 396.7 396.7 396.4 396.3 396.2 395.7 395.4 395.3 395.1 395.0 395.0 394.6 393.9 393.4 392.6 391.5 390.8 389.9 389.4 389.4 389.2 389.6 390.6 391.3 392.1 393.1 393.6 394.2 395.0 395.8 396.5 397.1 397.6 397.9 398.2 398.3 398.4 398.4 398.4 398.3 398.0 398.0 397.9 397.6 397.5 397.3 397.1 397.1 397.1 397.0 397.0 396.7 396.6 396.5 396.0 395.9 395.1 393.0 390.5 387.5 383.7 380.8 378.1 375.8 375.4 377.0 378.5 380.2 381.8 383.4 385.0 386.6 388.3 389.9 391.6 393.2 394.9 396.6 398.3 399.9 401.6 402.3 402.4 402.6 402.3 401.9 401.5 400.9 400.3 399.8 399.3 398.8 398.6 398.5 398.4 398.4 398.4 398.4 398.4 398.4 398.4 398.5 398.6 398.8 398.8 398.6 398.4 398.0 397.4 396.4 394.6 392.3 389.2 385.3 381.2 377.5 375.3 374.8 376.1 377.5 378.8 380.2 381.5 382.9 384.2 385.6 386.9 388.3 389.7 391.1 392.4 393.8 395.2 396.6 398.0 399.4 395.7 391.9 392.2 392.2 391.8 391.5 391.9 392.5 392.9 393.8 394.6 395.1 395.6 396.3 396.7 397.0 397.4 397.5 397.6 397.7 397.7 397.6 397.2 397.0 396.7 396.3 395.7 395.4 394.6 392.9 391.9 390.3 387.7 385.5 
382.9 380.3 377.4 375.4 374.2 374.7 376.5 378.3 380.1 381.9 383.7 385.5 387.3 389.2 391.0 392.9 394.7 396.6 398.5 400.4 400.7 400.6 400.7 400.4 400.4 400.3 400.3 400.3 400.2 399.9 399.5 399.4 399.0 398.6 398.5 398.0 397.7 397.6 397.2 397.0 396.9 396.7 396.7 396.8 396.9 397.3 397.6 397.8 398.2 398.4 398.2 398.2 397.8 396.9 395.7 393.4 390.2 385.6 379.9 375.0 371.3 371.1 373.1 375.0 376.9 378.9 380.8 382.8 384.8 386.8 388.8 390.8 392.8 394.8 396.9 398.9 399.8 399.8 399.8 399.7 399.4 399.2 399.1 399.1 398.9 399.0 399.0 398.7 398.6 398.5 398.1 397.9 397.9 397.8 397.7 397.7 397.6 397.7 397.9 397.9 398.2 398.4 398.4 398.4 398.2 397.7 396.9 395.4 393.5 391.0 387.7 384.7 381.8 379.5 378.8 379.8 380.9 381.9 382.9 383.9 385.0 386.0 387.0 388.1 389.1 390.2 391.2 392.3 393.3 394.4 395.4 396.3 397.6 399.2 399.9 400.6 400.9 400.9 400.9 400.8 400.4 400.2 400.0 399.4 399.1 398.8 398.5 398.4 398.2 398.2 398.3 398.3 398.3 398.4 398.3 398.3 398.5 398.5 398.5 398.6 398.5 398.5 398.6 398.4 398.2 398.2 397.9 397.7 397.6 397.0 396.1 395.4 394.3 393.2 392.4 391.7 391.9 392.7 393.9 395.3 396.4 396.9 397.4 397.8 398.2 398.7 399.0 399.2 399.3 399.2 399.1 399.0 398.5 398.0 397.7 397.2 397.0 396.9 396.5 396.5 396.4 396.2 396.2 396.2 396.2 396.3 396.3 396.3 396.7 397.1 397.4 397.8 398.2 398.0 398.2 398.4 398.3 398.3 398.4 398.2 397.9 397.9 397.7 397.5 397.5 397.2 396.9 396.8 396.4 395.7 395.4 395.1 394.4 394.2 394.4 394.6 395.0 395.5 395.9 396.1 396.5 396.7 396.9 397.0 396.9 396.9 397.0 396.9 396.8 396.8 396.6 396.6 396.7 396.8 396.8 396.9 396.9 396.8 396.9 397.0 397.1 397.2 397.2 397.4 397.5 397.5 397.6 397.7 397.6 397.5 397.5 397.2 396.7 396.0 394.8 392.3 389.6 386.0 381.3 376.2 371.3 367.5 364.1 364.9 368.9 372.9 376.9 380.9 385.1 389.2 393.4 397.7 401.2 402.2 402.0 401.3 400.5 399.3 398.6 398.1 397.7 397.5 397.4 397.2 397.2 397.1 396.9 396.9 396.9 396.8 396.9 397.0 397.0 397.1 397.1 397.0 397.2 397.4 397.1 397.1 396.8 396.3 395.5 393.2 390.2 386.1 381.6 376.2 371.4 367.4 364.9 365.0 365.8 366.6 367.4 368.2 369.0 369.9 370.7 371.5 372.3 373.2 374.0 374.8 375.7 376.5 377.3 378.2 379.0 379.9 380.7 381.7 384.4 387.8 390.9 393.5 395.1 396.2 396.7 397.1 397.2 397.1 397.1 397.2 397.2 397.2 397.2 397.2 397.2 397.2 397.5 397.7 397.7 397.8 397.8 397.6 397.4 396.9 396.0 394.2 390.6 386.9 382.7 378.5 376.8 376.2 375.6 375.0 374.4 373.8 373.2 372.6 372.1 371.5 370.9 370.3 369.7 369.2 368.6 368.0 367.4 366.9 366.3 365.7 365.1 366.2 368.4 370.6 373.3 375.7 377.0 378.1 379.2 379.5 379.7 379.5 379.0 378.4 378.0 377.7 377.4 377.4 377.4 377.3 377.2 376.7 376.0 375.3 374.6 373.9 373.2 372.9 372.1 371.2 371.3 370.7 370.0 369.9 369.6 368.7 367.8 366.8 364.7 361.8 358.9 355.4 352.6 350.0 347.1 346.4 347.3 349.2 353.6 359.7 365.9 372.3 376.9 379.8 380.3 379.5 378.6 377.7 376.8 376.1 375.7 375.3 375.1 374.9 374.8 374.6 374.5 374.4 374.4 374.4 374.4 374.5 374.5 374.5 374.6 374.6 374.5 374.6 374.5 374.2 373.7 371.9 369.5 365.4 359.7 353.8 348.7 345.1 342.0 341.8 343.5 345.1 346.7 348.4 350.1 351.7 353.4 355.1 356.8 358.5 360.2 361.9 363.6 365.4 367.1 368.8 370.6 372.4 373.8 374.3 374.9 375.4 375.5 375.7 376.2 376.5 376.6 376.8 376.8 376.5 376.3 376.1 376.0 376.0 376.0 376.0 375.9 375.8 375.8 375.7 375.5 375.1 374.8 374.3 373.5 373.2 372.6 371.9 371.5 370.7 370.1 370.4 370.2 369.8 369.9 369.6 368.7 367.5 365.7 362.8 359.3 355.7 352.9 350.9 350.9 351.8 355.2 359.8 365.4 371.7 376.7 379.6 380.3 379.8 379.0 378.2 377.2 376.6 376.1 375.6 375.4 375.2 375.1 374.8 374.7 374.6 374.5 374.5 374.5 374.6 374.6 374.5 374.6 374.7 374.6 374.5 374.3 373.9 
373.2 371.3 368.5 364.1 359.3 353.3 347.4 343.2 342.6 344.2 345.7 347.3 348.9 350.5 352.1 353.7 355.4 357.0 358.6 360.3 361.9 363.6 365.3 366.9 368.6 370.3 372.0 373.7 375.4 377.2 378.0 378.0 378.1 377.9 377.9 378.0 378.0 377.9 377.5 377.0 376.5 375.9 375.5 375.1 374.7 374.4 374.3 374.0 374.1 374.5 374.8 375.3 375.9 376.2 376.5 376.8 376.8 376.8 376.9 376.5 376.0 375.7 374.6 373.1 371.5 369.0 365.1 361.5 358.7 356.8 356.4 357.4 358.4 359.4 360.4 361.5 362.5 363.5 364.6 365.6 366.7 367.7 368.8 369.8 370.9 371.9 372.7 374.1 375.8 376.8 377.3 377.7 377.7 377.4 377.3 377.1 376.6 376.5 376.2 376.1 376.2 376.1 375.8 375.5 374.9 374.4 374.2 373.6 373.0 372.7 371.9 371.5 371.9 371.6 371.3 371.3 370.6 370.0 369.1 368.1 366.1 363.3 361.0 358.7 358.4 359.5 360.5 361.6 362.6 363.7 364.8 365.8 366.9 368.0 369.1 370.1 371.2 372.3 373.4 374.5 375.6 376.2 376.6 376.9 376.8 376.9 376.9 376.7 376.7 376.5 376.1 376.0 375.7 375.4 375.4 375.3 375.3 375.2 375.0 375.4 375.8 375.6 375.8 376.1 375.7 375.4 375.1 374.3 373.2 371.7 369.4 365.7 362.4 359.0 356.3 354.3 354.6 356.4 358.3 360.1 362.0 363.9 365.7 367.6 369.5 371.5 373.4 375.3 377.2 379.2 381.2 383.1 383.8 383.8 383.5 382.7 381.7 380.7 379.7 379.0 378.2 377.4 377.0 376.6 376.4 376.2 376.0 375.8 375.8 375.8 375.8 375.8 375.8 375.9 376.0 376.1 376.2 376.2 376.1 375.9 375.5 374.6 373.0 369.7 366.1 361.6 357.4 356.1 357.4 358.7 360.0 361.4 362.7 364.0 365.4 366.7 368.1 369.5 370.8 372.2 373.6 375.0 376.3 377.7 379.1 380.5 381.9 383.4 385.1 387.4 389.7 392.3 395.3 397.2 398.6 399.9 400.1 400.0 399.9 399.5 399.1 398.7 398.4 398.0 397.9 397.8 397.4 397.2 397.0 396.6 396.1 395.5 394.9 394.3 394.3 394.2 393.7 394.0 394.4 394.2 394.0 393.9 392.9 391.4 390.0 387.5 384.9 382.5 379.6 377.7 376.1 376.9 379.1 381.4 383.6 385.9 388.2 390.5 392.8 395.1 397.4 399.7 401.2 401.3 401.0 400.3 400.0 400.1 399.8 399.4 399.5 399.4 398.8 398.9 398.8 398.5 398.7 398.6 398.4 398.5 398.5 398.4 398.5 398.6 398.5 398.4 398.3 397.9 397.6 397.2 396.7 395.1 393.0 389.5 385.1 379.3 373.4 368.3 364.0 363.8 365.5 367.2 369.0 370.7 372.4 374.2 376.0 377.7 379.5 381.3 383.1 384.9 386.7 388.5 390.3 392.1 394.0 395.8 397.7 399.6 400.5 400.3 400.5 400.5 400.6 400.7 400.6 400.3 400.1 399.8 399.2 398.7 398.3 397.9 397.6 397.2 396.7 396.2 396.1 396.0 395.9 396.1 396.6 397.1 397.5 397.9 398.0 398.2 398.7 398.7 398.7 399.0 398.9 399.0 399.1 398.8 398.6 398.5 398.0 397.7 397.6 397.1 396.8 396.9 396.8 396.9 397.5 397.7 398.0 398.4 398.6 398.7 399.1 399.3 399.4 399.7 399.8 399.6 399.7 399.7 399.5 399.4 399.3 399.0 398.7 398.5 398.3 398.2 398.2 398.0 397.9 397.9 397.8 397.6 397.6 397.6 397.2 397.0 396.7 395.7 394.3 391.8 387.9 383.2 378.6 373.5 368.5 365.9 366.4 368.1 369.9 371.7 373.5 375.3 377.1 378.9 380.8 382.6 384.4 386.3 388.2 390.0 391.9 393.8 395.7 397.6 398.9 399.2 399.2 399.2 399.0 399.0 399.1 398.8 398.6 398.5 398.2 397.8 397.7 397.6 397.5 397.5 397.5 397.5 397.6 397.7 397.8 397.9 397.9 397.9 397.8 397.7 397.6 396.9 396.3 394.8 391.4 387.1 381.9 377.0 372.0 368.6 368.8 372.3 375.8 379.3 382.9 386.5 390.1 393.8 397.5 401.3 405.1 408.9 412.7 416.6 420.5 424.5 428.5 432.5 436.6 438.7 439.9 441.3 442.3 443.4 444.2 444.7 445.0 445.0 445.0 444.4 444.1 443.9 443.4 442.9 442.9 442.9 442.7 442.9 443.2 442.9 443.3 443.9 444.1 444.5 445.2 444.9 444.4 443.7 442.5 441.3 439.0 436.3 434.4 432.8 432.5 433.0 433.4 433.9 434.4 434.9 435.3 435.8 436.3 436.8 437.2 437.7 438.2 438.7 439.2 439.7 440.1 440.6 441.1 441.6 441.9 442.3 443.3 444.5 445.1 445.1 445.4 445.6 445.6 445.6 445.8 445.6 445.5 445.8 445.8 445.8 
446.0 446.1 446.3 446.4 446.4 446.2 446.0 445.8 445.5 444.9 443.7 441.4 438.6 435.3 431.3 428.2 426.8 425.7 424.5 423.3 422.1 420.9 419.6 418.4 417.2 416.0 414.8 413.6 412.4 411.2 410.1 408.9 407.7 406.5 405.4 404.2 403.0 402.4 402.0 401.1 400.6 400.3 400.3 400.3 400.1 399.9 399.5 399.0 398.6 398.3 398.3 398.4 398.5 398.6 398.8 398.7 398.6 398.6 398.6 398.5 398.5 398.6 398.6 398.5 398.4 398.1 397.7 397.1 396.2 395.0 393.5 390.5 387.5 384.5 381.3 380.9 382.1 383.4 384.6 385.8 387.1 388.3 389.6 390.8 392.1 393.3 394.6 395.8 397.1 398.4 399.7 401.0 402.2 402.7 402.1 401.5 400.6 400.1 399.8 399.5 399.4 399.2 399.1 399.1 398.9 399.0 399.1 398.8 398.8 399.1 399.1 399.2 399.4 399.4 399.2 399.1 399.1 399.0 398.7 398.6 398.6 398.3 398.3 398.4 398.2 398.4 398.6 398.3 397.9 397.7 396.6 394.4 391.9 388.5 384.4 380.7 377.9 376.1 374.7 373.3 371.9 370.5 369.1 367.8 366.4 365.0 365.4 366.8 368.4 369.4 370.2 370.9 371.2 371.1 371.1 371.0 371.0 371.1 371.0 370.4 370.4 370.6 371.3 373.4 376.0 378.2 381.6 384.5 387.6 391.1 394.0 396.0 396.8 397.4 397.3 397.1 396.9 396.7 396.3 396.0 395.5 394.9 394.5 393.9 393.4 393.0 392.6 392.3 392.1 391.9 391.8 391.9 391.9 392.0 392.6 393.1 393.8 394.5 395.4 396.3 397.1 397.7 398.0 398.2 398.3 398.4 398.2 398.3 398.3 398.0 398.0 397.9 397.7 397.8 397.7 397.5 397.5 397.2 397.0 397.1 397.1 397.1 397.4 397.4 397.2 397.4 397.1 396.7 395.5 392.7 389.6 385.7 381.7 377.8 374.8 372.7 372.4 373.6 374.7 375.9 377.0 378.2 379.4 380.5 381.7 382.9 384.1 385.3 386.5 387.7 388.9 390.1 391.3 393.0 395.1 397.0 398.4 399.3 400.0 400.3 400.3 400.1 399.8 399.4 399.3 399.2 398.8 398.6 398.6 398.5 398.2 398.2 398.0 397.9 397.9 397.8 397.7 397.6 397.4 396.3 394.3 391.9 388.7 384.8 381.2 377.9 377.0 378.0 379.0 379.9 380.9 381.9 382.9 383.9 384.8 385.8 386.8 387.8 388.8 389.8 390.8 391.8 392.8 393.8 394.8 395.8 396.9 397.9 398.9 398.8 398.4 398.0 397.7 397.9 398.2 398.0 397.9 397.9 397.8 397.6 397.3 397.1 396.8 396.3 396.0 395.5 395.0 395.0 394.7 394.6 394.9 395.2 395.5 396.0 396.6 396.9 397.3 397.6 397.7 397.8 397.9 397.8 397.6 397.6 397.4 396.8 396.7 396.4 395.7 395.4 395.1 394.4 394.0 393.7 392.8 392.6 392.4 391.8 391.9 392.7 393.0 393.8 394.5 395.2 396.1 397.0 398.0 398.8 399.4 399.8 400.1 400.2 400.0 399.9 399.9 399.5 399.4 399.4 399.2 399.2 399.3 399.2 399.2 399.3 399.2 399.1 399.1 398.8 398.5 398.2 397.4 395.4 393.2 389.3 383.1 376.5 370.8 366.3 361.9 358.8 359.1 361.0 362.8 364.7 366.6 368.5 370.4 372.3 374.2 376.1 378.1 380.0 382.0 383.9 385.9 387.9 389.9 391.9 393.6 393.9 394.5 395.2 395.5 396.2 397.1 397.2 397.7 397.9 397.7 397.5 397.3 397.1 397.1 397.0 396.8 396.8 396.6 396.4 396.4 396.4 396.3 396.3 396.3 396.2 396.1 396.0 395.7 395.5 395.3 394.9 394.4 394.1 393.7 392.8 392.2 391.4 390.2 390.0 390.2 390.0 390.1 390.7 390.3 390.1 390.4 389.7 389.6 390.1 390.0 390.4 391.5 392.9 394.7 397.0 398.3 398.5 398.6 398.6 398.5 398.4 398.4 398.4 398.3 398.0 398.0 398.0 397.9 397.9 397.9 397.9 397.9 397.9 397.8 397.8 397.8 397.7 397.6 397.4 396.8 395.2 393.3 389.9 385.6 380.4 374.8 369.8 366.1 365.7 367.0 368.4 369.7 371.1 372.4 373.8 375.1 376.5 377.9 379.3 380.6 382.0 383.4 384.8 386.2 387.6 389.0 390.5 391.9 393.0 394.6 395.9 396.9 397.7 398.2 398.7 399.1 399.2 399.4 399.2 398.7 398.5 398.2 397.9 398.0 397.9 397.8 398.1 398.0 398.0 398.2 398.2 398.3 398.5 398.7 398.8 398.8 399.0 398.9 398.8 398.7 398.5 398.2 397.7 397.5 397.2 396.8 396.6 396.3 395.9 395.6 395.5 395.4 395.7 396.1 396.4 397.0 397.5 398.1 398.5 399.1 399.8 400.0 400.0 399.9 399.4 398.8 398.4 397.9 397.7 397.6 397.4 397.2 397.2 
397.1 397.0 397.0 396.9 396.8 396.8 396.8 396.9 397.0 397.1 397.1 397.2 397.6 397.7 397.8 397.8 397.6 397.2 396.6 395.5 393.4 389.4 384.5 379.1 373.7 370.3 369.4 371.1 372.8 374.6 376.3 378.0 379.8 381.5 383.3 385.1 386.9 388.7 390.5 392.3 394.1 395.9 397.7 399.1 399.6 399.4 399.0 399.0 399.3 399.1 399.0 399.2 398.8 398.4 398.5 398.3 398.0 398.3 398.2 398.3 398.5 398.4 398.3 398.7 398.7 398.7 399.0 399.0 398.8 399.0 399.0 399.0 399.1 398.8 398.6 397.6 395.9 393.3 389.3 384.5 380.2 375.4 371.3 369.1 369.6 371.8 374.0 376.2 378.5 380.7 383.0 385.3 387.6 389.9 392.2 394.5 396.9 399.2 401.6 402.8 403.0 403.2 403.2 402.9 402.4 401.7 400.9 400.0 399.4 398.8 398.6 398.6 398.5 398.4 398.4 398.4 398.3 398.2 398.3 398.5 398.6 398.7 398.8 398.6 398.2 397.9 396.4 393.8 390.1 385.2 380.5 375.8 371.8 370.9 371.7 372.6 373.4 374.2 375.1 375.9 376.7 377.6 378.4 379.3 380.1 381.0 381.8 382.7 383.5 384.4 385.2 386.1 387.0 387.8 389.2 391.4 393.6 395.5 396.9 397.7 398.2 398.5 399.0 399.3 399.2 399.2 399.3 399.2 399.1 399.2 399.2 399.1 399.2 399.3 399.4 399.4 399.4 399.4 399.2 398.9 398.4 397.5 395.8 393.7 391.3 388.1 384.4 381.0 378.7 378.7 379.8 380.9 382.0 383.1 384.3 385.4 386.6 387.7 388.8 390.0 391.1 392.3 393.5 394.6 395.8 397.0 398.1 399.3 399.6 399.5 399.2 398.6 398.2 398.0 397.7 397.2 397.0 396.9 396.6 396.3 396.3 396.1 395.9 395.9 395.9 396.0 396.3 396.7 396.8 396.9 397.1 397.4 397.6 397.8 397.9 397.8 397.6 396.9 396.1 394.6 392.7 390.3 387.9 385.3 382.6 381.3 379.8 378.9 379.1 379.4 379.5 380.0 380.7 381.4 382.0 382.7 383.4 384.1 384.5 384.6 385.1 385.9 387.2 388.5 389.4 390.2 391.0 391.5 392.1 392.4 392.7 393.0 393.2 393.6 394.1 394.4 394.7 395.1 395.5 395.8 395.9 396.0 396.0 395.9 396.1 396.2 396.1 396.2 396.0 395.5 394.5 392.2 388.6 384.4 379.4 375.4 373.6 374.6 375.5 376.5 377.5 378.5 379.4 380.4 381.4 382.4 383.4 384.4 385.4 386.3 387.3 388.3 389.3 390.4 391.4 392.4 393.2 394.3 395.7 396.6 397.1 397.8 398.2 398.3 398.7 398.7 398.6 398.5 398.4 398.4 398.4 398.5 398.6 398.6 398.8 399.0 399.0 399.0 399.1 398.9 398.8 399.0 398.8 398.5 398.1 397.7 397.1 396.7 396.3 395.4 394.6 393.7 392.4 390.9 389.0 387.2 384.9 382.4 380.4 378.8 377.0 375.3 374.2 373.7 373.8 373.8 373.6 373.5 373.0 372.4 372.2 372.0 371.7 372.1 373.3 373.9 374.2 374.6 374.7 374.7 374.8 374.9 374.8 374.9 374.9 374.9 374.9 374.9 374.8 375.0 375.2 375.2 375.3 375.5 375.3 375.4 375.6 375.5 375.6 375.9 375.8 375.7 375.9 375.9 375.9 376.0 375.9 375.9 376.0 375.9 375.9 376.0 375.9 375.9 376.0 375.9 375.9 376.0 375.9 375.9 376.0 375.9 375.8 375.9 375.9 375.8 375.9 375.9 375.8 375.9 375.8 375.7 375.9 375.8 375.7 375.9 375.8 375.7 375.9 375.9 375.8 375.9 375.9 375.9 376.0 375.9 375.9 376.1 376.1 376.0 376.1 376.0 375.9 376.0 375.8 375.7 375.8 375.5 375.5 375.6 375.5 375.5 375.6 375.5 375.5 375.6 375.5 375.5 375.5 375.4 375.4 375.4 375.3 375.2 375.2 375.1 375.1 375.2 374.8 374.6 374.8 374.8 375.0 375.4 375.6 375.7 375.9 375.8 375.7 375.9 375.7 375.3 374.9 374.4 373.3 371.9 370.0 367.2 364.3 361.5 359.9 358.3 357.8 358.5 359.1 359.8 360.5 361.2 361.8 362.5 363.2 363.9 364.6 365.3 365.9 366.6 367.3 368.0 368.7 369.4 370.1 370.8 371.5 372.2 372.9 373.6 374.3 375.0 375.7 376.4 377.1 377.8 378.5 379.2 379.0 378.2 377.7 377.2 376.8 376.7 376.8 376.6 376.1 376.0 375.9 375.7 375.7 375.8 375.7 375.6 375.5 375.3 374.9 374.5 373.6 373.0 372.4 371.7 371.1 370.3 369.2 367.2 364.2 360.9 356.4 352.2 348.0 343.0 337.9 333.5 329.9 328.1 328.7 329.2 329.8 330.3 330.9 331.4 331.9 332.5 333.0 333.6 334.1 334.7 335.3 335.8 336.4 336.9 335.9 335.1 335.4 335.4 
335.0 334.8 334.8 334.6 334.5 334.6 334.6 334.7 334.7 334.6 334.7 334.7 334.6 334.7 334.6 334.4 334.5 334.5 334.3 334.3 334.3 334.1 334.2 334.1 334.1 334.3 334.3 334.3 334.5 334.6 334.7 334.9 334.9 335.0 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.3 335.2 335.3 335.2 335.1 335.3 335.2 335.1 335.3 335.2 335.1 335.2 335.1 335.0 335.1 335.0 334.8 334.8 334.6 334.4 334.5 334.4 334.3 334.4 334.3 334.3 334.4 334.4 334.5 334.7 334.7 334.7 334.9 334.9 335.0 335.3 335.2 335.0 335.0 334.8 334.5 334.4 334.3 334.2 334.2 334.1 334.0 333.9 333.7 333.5 333.0 332.8 332.5 332.2 331.9 331.5 331.2 330.1 328.6 326.5 323.9 321.5 320.5 321.1 321.7 322.3 322.9 323.5 324.1 324.7 325.3 326.0 326.6 327.2 327.8 328.4 329.0 329.7 330.3 330.9 331.5 332.1 332.8 333.4 334.0 334.3 334.7 335.4 335.4 335.1 335.4 335.3 335.0 335.1 335.0 334.7 334.6 334.6 334.6 334.7 334.8 335.0 335.2 335.3 335.6 335.7 335.7 335.9 335.9 335.7 335.5 335.1 334.7 334.4 334.0 333.6 333.2 332.6 331.6 330.7 329.7 328.6 327.5 326.7 325.8 325.0 324.3 323.2 321.8 320.9 319.8 318.9 318.2 317.6 317.0 317.0 316.8 316.8 316.9 317.3 317.8 317.8 317.6 317.4 317.1 317.2 317.3 317.1 317.0 317.1 317.0 316.9 317.1 317.0 316.9 317.1 316.9 316.7 316.6 316.2 316.0 316.0 315.8 315.7 315.7 315.3 315.2 315.4 315.4 315.5 315.9 316.1 316.4 316.7 316.8 317.2 317.4 317.5 317.6 317.7 317.6 317.5 317.6 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.6 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.3 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.3 317.2 317.2 317.4 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.4 317.5 317.4 317.3 317.4 317.3 317.2 317.3 317.2 317.1 317.0 316.7 316.6 316.7 316.7 316.7 316.9 316.8 316.7 316.7 316.7 316.6 316.7 316.7 316.6 316.5 316.4 316.3 316.3 316.4 316.5 316.6 316.5 316.4 316.4 316.4 316.3 316.2 316.2 316.0 315.7 315.7 315.7 315.7 315.7 315.7 315.7 315.9 316.0 316.2 316.7 317.0 317.3 317.7 317.9 318.3 318.6 318.9 319.0 319.0 319.1 319.1 319.2 319.2 319.0 319.0 318.9 318.1 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7 317.7",
"input_type": "phoneme",
"offset": 51.764
},
{
"text": "SP 行 走 在 危 险 边 缘 肆 意 的 怪 盗 SP 自 由 的 跳 着 指 尖 上 的 舞 蹈 SP 轻 盈 的 像 暗 影 中 藏 伏 的 波 斯 猫 等 待 下 一 个 目 标 SP",
"ph_seq": "SP x ing z ou z ai w ei x ian b ian y van s i0 y i d e g uai d ao SP z i0 y ou d e t iao zh e zh ir j ian sh ang d e w u d ao SP q ing y ing d e x iang an y ing zh ong c ang f u d e b o s i0 m ao d eng d ai x ia y i g e m u u b iao SP",
"note_seq": "rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 A#4 A#4 A4 A4 G4 G4 D4 D4 G4 G4 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 D5 C5 C5 B4 B4 C5 C5 C5 C5 G5 G5 C5 C5 rest D5 D5 B4 B4 D5 D5 G5 G5 D5 C5 C5 B4 B4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 A#4 G4 G4 D4 D4 G4 G4 F4 F4 G4 G4 A#4 A#4 C5 C5 C#5 D5 D5 rest",
"note_dur_seq": "0.6 0.136 0.136 0.137 0.137 0.545 0.545 0.546 0.546 0.2720001 0.2720001 0.273 0.273 0.273 0.273 0.2719998 0.2719998 0.546 0.546 0.5450001 0.5450001 0.2730002 0.2730002 0.4089999 0.4089999 0.1370001 0.1359997 0.1359997 0.1360002 0.1360002 0.546 0.546 0.5450001 0.5450001 0.2729998 0.2729998 0.2730002 0.2730002 0.2719998 0.2719998 0.546 0.546 0.2730002 0.2730002 0.5449996 0.5449996 0.6820002 0.6820002 0.1359997 0.1370001 0.1370001 0.1360006 0.1360006 0.5450001 0.5450001 0.5459995 0.5459995 0.2729998 0.2720003 0.2720003 0.2729998 0.2729998 0.3640003 0.3640003 0.1809998 0.1809998 0.3640003 0.3640003 0.1820002 0.1820002 0.3639994 0.3639994 0.1810007 0.1810007 0.3639994 0.3639994 0.1820002 0.1820002 0.4090004 0.4090004 0.4089994 0.4089994 0.2729998 0.2729998 0.2720003 0.2720003 0.5460005 0.8179989 0.8179989 0.5",
"is_slur_seq": "0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0",
"ph_dur": "0.387498 0.212502 0.07009 0.06591 0.082457 0.054543 0.474542 0.070457 0.339182 0.206818 0.244725 0.027275 0.20709 0.06591 0.163907 0.109093 0.195866 0.076134 0.442591 0.103409 0.447275 0.097725 0.224134 0.048866 0.409 0.088134 0.048866 0.070089 0.06591 0.081458 0.054543 0.452815 0.093185 0.369997 0.175003 0.103681 0.169319 0.115049 0.157951 0.184498 0.087502 0.475543 0.070457 0.185499 0.087502 0.506364 0.038635 0.682 0.054182 0.081818 0.076774 0.060226 0.097365 0.038635 0.35409 0.19091 0.475542 0.273 0.070457 0.168591 0.103409 0.218457 0.054543 0.276499 0.087502 0.148048 0.032951 0.325365 0.038635 0.067231 0.114769 0.270814 0.093185 0.148049 0.032951 0.286729 0.077271 0.057 0.125 0.311275 0.097725 0.381724 0.027275 0.152547 0.120453 0.272 0.436908 0.109093 0.817999 0.5",
"f0_timestep": "0.005",
"f0_seq": "597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 597.9 596.0 595.2 593.7 593.4 592.8 591.4 591.1 589.4 588.3 588.3 586.1 585.0 583.9 580.6 577.6 573.9 570.6 565.6 556.0 549.7 543.5 537.4 531.3 525.3 519.4 513.6 507.8 501.0 501.9 501.6 501.2 500.5 500.2 500.1 499.6 498.9 498.5 497.1 494.8 492.5 487.8 483.2 476.1 469.3 464.6 459.2 456.3 463.8 471.3 479.0 486.9 494.8 502.9 511.1 519.5 527.9 536.6 545.3 552.2 550.9 550.5 551.8 554.3 555.6 557.3 557.6 559.2 559.6 560.4 561.5 562.0 561.5 562.6 562.2 562.4 563.5 563.7 563.1 564.7 564.4 566.4 568.3 570.1 571.5 574.2 575.3 577.2 580.0 581.6 581.6 584.3 584.3 586.3 588.1 590.0 591.4 592.8 592.9 593.2 593.5 593.5 593.5 593.5 593.5 593.5 593.4 593.1 593.1 593.1 593.1 593.1 592.8 592.4 592.5 592.8 592.4 592.4 592.4 592.3 591.8 591.9 591.1 590.5 590.8 591.1 590.9 591.8 591.7 591.5 592.1 591.7 591.4 591.6 591.0 590.4 590.6 590.0 589.4 589.6 589.0 588.7 589.0 588.8 588.0 587.7 587.2 586.3 586.0 586.1 585.3 597.7 606.6 619.7 629.9 639.4 651.2 660.0 673.2 683.2 692.8 706.5 716.4 733.0 746.8 759.5 776.4 779.0 782.6 784.9 787.2 789.2 790.4 791.0 791.7 791.7 790.4 790.4 789.7 789.0 789.0 788.9 788.5 789.0 789.1 789.4 789.5 789.9 789.1 789.5 789.9 789.9 789.9 790.7 790.4 790.4 791.7 791.7 792.0 793.0 792.6 792.4 793.6 793.4 792.9 794.0 793.9 793.1 794.1 794.2 793.1 794.0 794.0 793.3 794.0 794.0 792.8 793.6 792.9 792.3 792.6 792.6 792.2 793.0 792.6 792.6 793.1 793.1 792.8 793.5 793.1 793.2 793.6 793.6 793.6 794.0 793.6 793.6 793.5 792.9 791.7 790.0 788.1 785.0 783.0 780.4 776.1 768.4 760.7 753.1 745.5 738.1 730.7 723.4 716.2 709.0 701.9 694.9 688.0 681.1 674.3 667.5 660.9 654.3 647.7 641.2 634.8 628.5 622.2 616.0 609.8 603.7 597.7 591.7 587.0 587.3 589.1 589.4 590.6 591.5 592.1 592.5 592.8 593.1 593.0 592.4 593.0 592.4 592.9 593.5 593.8 594.3 594.8 594.8 594.8 594.5 593.8 593.5 592.6 591.6 590.7 590.0 588.3 587.9 588.6 588.3 589.1 589.7 589.8 589.0 588.7 587.0 585.3 583.5 580.9 577.2 574.5 572.6 566.3 560.1 553.9 547.9 541.9 535.9 530.0 524.2 518.5 512.8 507.4 508.5 510.7 513.6 514.9 518.0 520.4 522.6 525.3 526.6 527.9 528.2 528.4 528.1 528.1 528.0 527.5 527.5 528.0 527.5 527.9 528.4 528.1 528.2 528.7 528.8 529.1 529.3 529.4 529.6 529.9 529.9 529.9 529.8 529.3 528.8 528.9 528.1 527.2 527.2 525.7 524.7 524.5 523.4 522.3 519.5 517.3 514.9 512.6 511.6 508.5 506.4 505.1 501.9 500.5 498.8 497.4 496.2 497.7 497.9 498.5 498.5 498.5 498.8 498.8 498.9 499.6 499.3 499.7 500.2 500.0 500.5 500.8 500.5 500.8 500.7 500.2 500.2 499.8 499.3 498.7 498.3 497.6 496.3 495.3 493.0 490.4 488.5 484.2 481.2 478.9 477.2 476.7 476.2 475.7 475.2 474.6 474.1 473.6 473.1 472.6 472.1 471.5 471.0 470.5 470.0 469.5 469.0 468.5 468.0 467.5 466.9 466.4 468.1 468.3 469.7 469.9 469.9 469.4 469.4 469.4 469.1 468.9 468.5 468.1 467.8 467.8 467.8 468.1 468.1 468.4 468.6 468.6 468.6 468.9 468.9 468.9 469.1 469.1 469.1 469.4 469.1 468.9 468.9 
468.9 468.3 467.9 467.2 466.5 465.4 464.2 462.5 461.9 460.4 459.7 458.7 456.1 453.9 452.0 450.3 448.5 446.4 444.9 442.8 441.8 440.3 441.4 441.8 441.8 442.0 442.5 442.6 443.1 443.3 443.1 443.4 443.8 443.6 443.9 444.1 443.8 443.9 444.3 444.1 444.3 444.8 444.6 444.6 445.1 444.9 445.0 445.3 445.1 445.2 445.4 445.3 445.2 445.4 445.3 445.1 445.3 445.1 445.1 445.3 445.1 445.2 445.3 445.1 445.2 445.4 445.1 445.2 445.4 445.3 445.1 445.6 445.3 445.1 445.3 445.1 444.9 444.8 444.3 443.9 444.1 443.6 443.6 443.6 442.8 442.8 443.0 442.6 443.1 442.7 439.5 437.3 433.1 430.0 425.0 422.6 421.7 420.9 420.1 419.3 418.5 417.7 416.8 416.0 415.2 414.4 413.6 412.8 412.0 411.2 410.4 409.6 408.8 408.0 407.2 406.4 405.6 404.8 404.0 403.3 402.5 401.7 400.9 400.1 399.4 398.6 397.8 397.0 396.3 395.6 395.6 395.8 395.4 395.2 395.2 395.2 395.0 395.4 395.4 395.6 395.6 395.6 395.4 395.6 395.4 395.6 395.6 395.4 395.4 395.4 395.2 395.5 395.6 395.4 395.7 395.9 395.7 395.9 396.1 395.9 396.1 396.3 396.1 396.3 396.5 396.3 396.6 396.6 396.6 396.6 396.6 396.3 396.3 396.3 396.3 396.3 396.3 396.1 396.3 396.3 396.3 396.3 396.3 396.3 396.3 396.1 396.0 395.8 395.4 395.1 394.5 393.5 391.3 390.0 385.9 383.7 380.8 379.4 378.0 377.8 377.1 375.0 372.9 370.9 368.8 366.8 364.8 362.7 360.7 358.7 356.7 354.8 352.8 350.9 348.9 347.0 345.1 343.1 341.2 339.4 337.5 335.6 333.7 331.9 330.1 328.2 326.4 324.6 322.8 321.0 319.2 317.5 315.7 314.0 312.2 310.5 308.8 307.1 305.4 303.4 300.5 301.1 301.5 301.7 300.9 300.4 300.0 299.9 299.5 299.7 300.0 299.6 299.9 300.2 299.9 300.0 299.9 299.7 299.7 299.7 299.1 299.3 299.1 299.1 298.9 299.5 299.0 298.1 297.4 293.6 291.0 285.9 282.5 280.6 285.1 289.6 294.3 299.0 303.8 308.7 313.6 318.7 323.8 329.0 334.2 339.6 345.0 350.6 356.2 361.9 367.7 373.6 379.6 385.7 390.1 389.5 387.9 385.9 384.6 384.1 383.7 383.9 384.1 384.8 385.1 385.7 386.4 386.4 385.7 386.3 385.9 387.0 387.8 388.4 388.4 389.5 389.5 390.4 391.5 392.8 393.1 394.5 394.6 395.0 395.3 395.6 395.3 395.6 395.2 395.2 395.4 395.4 395.5 395.9 395.9 395.9 396.1 396.3 396.3 396.6 396.6 396.6 396.6 396.6 396.6 396.8 396.6 396.9 397.2 397.8 398.2 398.9 399.0 398.8 399.2 398.4 398.2 399.3 402.8 406.3 409.8 413.3 416.9 420.5 424.2 427.8 431.5 435.3 439.0 442.9 446.7 450.6 454.5 458.4 462.4 466.4 470.4 474.5 478.6 482.7 486.9 491.1 495.4 499.7 504.0 508.4 512.8 517.2 521.7 526.2 530.8 535.4 540.0 544.7 549.4 554.2 559.0 563.8 568.7 573.6 578.6 583.6 588.0 589.4 590.4 590.4 591.5 591.8 592.1 591.4 591.4 592.0 591.7 591.4 591.6 590.7 590.5 590.7 590.4 589.1 581.3 569.7 560.6 553.1 542.3 534.6 523.8 514.8 507.2 497.4 491.9 493.7 494.0 494.5 495.0 494.7 494.1 493.5 491.6 487.9 485.1 478.8 473.6 468.9 463.3 459.7 468.3 477.0 485.8 494.9 504.0 513.4 522.9 532.6 542.5 552.6 563.2 573.4 575.9 577.2 578.9 580.2 581.8 582.6 584.2 583.6 584.1 583.6 583.6 583.9 583.9 583.0 583.2 582.6 583.7 584.3 585.2 585.1 586.0 585.7 586.3 587.3 588.4 588.7 589.5 590.0 590.5 590.8 591.4 591.5 591.8 591.8 591.8 591.8 592.1 592.1 592.4 592.4 592.4 592.4 592.4 592.4 592.5 592.8 592.4 592.4 592.8 592.8 592.8 592.7 592.4 592.5 592.7 592.4 592.4 592.4 592.1 592.1 592.1 592.1 592.1 592.1 592.1 592.1 592.1 592.1 592.4 592.4 592.4 592.4 592.1 591.8 590.6 589.7 585.3 581.9 575.8 570.7 568.0 573.4 578.8 584.4 589.9 595.5 601.2 606.9 612.7 618.5 624.4 630.4 636.4 642.4 648.5 654.7 660.9 667.2 673.6 680.0 686.5 693.0 699.6 706.2 713.0 719.8 726.6 733.5 740.5 747.6 754.7 759.5 761.7 766.0 767.9 770.1 772.2 773.6 772.6 773.5 772.8 772.5 773.2 773.3 772.2 773.6 773.8 774.5 777.0 778.1 778.1 781.3 781.3 783.2 
785.7 787.2 787.5 789.0 789.4 789.6 790.4 791.2 790.8 792.1 791.7 791.7 792.3 792.6 792.8 793.6 793.6 793.5 793.1 793.0 792.7 793.1 792.6 792.2 792.2 792.1 791.7 792.1 791.7 792.2 792.2 792.2 792.3 792.6 792.3 792.6 792.6 792.2 792.2 792.1 791.7 791.3 791.3 791.3 791.3 791.3 791.3 790.9 791.3 790.7 790.3 789.9 789.4 789.4 789.4 789.0 789.0 786.0 784.4 777.8 771.6 765.6 757.1 751.6 742.7 733.9 725.2 716.6 708.2 699.8 691.5 683.3 675.2 667.2 659.3 651.5 643.8 636.2 628.6 621.2 613.8 606.6 599.4 592.3 586.7 586.8 587.3 588.9 589.7 591.2 591.9 592.4 593.2 593.5 593.2 593.5 593.5 593.2 593.5 593.3 592.8 592.8 592.7 592.4 592.8 592.8 592.8 593.1 592.8 592.1 591.9 590.7 588.4 585.3 579.1 573.4 567.7 561.2 556.6 554.7 552.9 551.0 549.1 547.3 545.4 543.6 541.8 539.9 538.1 536.3 534.5 532.7 530.9 529.1 527.3 525.5 523.8 522.0 520.2 520.4 521.2 521.4 523.4 524.2 524.6 525.5 526.0 526.3 526.6 526.8 526.6 526.6 526.8 526.6 527.0 527.2 527.2 527.2 527.2 527.2 527.2 527.5 527.5 527.5 527.8 527.7 527.2 526.0 523.3 519.8 514.6 510.1 502.5 497.9 493.3 493.2 493.0 492.9 492.7 492.6 492.5 492.3 492.2 492.1 491.9 491.8 491.7 491.5 491.4 491.3 491.1 491.0 490.8 490.7 490.6 490.7 491.6 493.5 494.2 495.8 496.8 497.3 498.6 499.0 499.3 499.3 499.3 499.2 498.8 499.0 499.0 499.0 498.8 498.8 498.4 498.2 498.2 497.3 497.3 496.9 496.4 496.2 495.7 495.0 492.6 490.8 489.6 486.1 485.4 487.4 489.4 491.5 493.5 495.5 497.6 499.7 501.7 503.8 505.9 508.0 510.1 512.2 514.3 516.5 518.6 520.8 522.9 525.1 527.3 529.2 530.0 530.6 530.7 531.2 530.2 529.7 528.4 527.8 527.5 526.8 526.3 526.6 526.1 525.4 525.9 525.6 525.4 526.3 526.3 525.9 526.8 526.6 527.3 528.1 529.0 529.2 530.2 530.2 530.2 530.7 531.1 530.9 531.5 531.5 531.2 531.5 531.8 531.8 532.1 532.1 532.1 532.1 532.1 532.1 532.1 532.0 531.8 531.8 531.8 531.5 531.5 531.5 531.5 531.5 531.5 531.5 531.5 531.4 531.2 531.4 531.2 531.2 530.9 530.9 530.9 530.8 530.6 530.5 530.2 529.9 529.5 529.0 528.3 527.8 527.3 526.5 526.3 526.5 526.3 525.7 526.0 526.0 526.4 526.9 528.1 528.0 527.9 527.8 527.7 527.6 527.5 527.4 527.3 527.3 527.2 527.1 527.0 526.9 526.8 526.7 526.6 526.5 526.4 526.3 526.2 526.1 526.1 526.0 526.3 526.9 526.8 526.6 526.6 526.9 526.6 526.9 527.2 526.9 526.9 526.8 526.6 526.6 526.6 526.6 526.6 526.8 526.6 526.6 526.9 526.6 526.6 526.6 526.5 526.3 526.2 525.9 525.7 525.6 525.4 524.9 524.4 523.9 521.8 520.8 519.2 517.4 516.6 516.5 516.0 535.7 553.0 567.0 587.5 603.8 625.6 643.9 661.5 683.9 700.9 728.8 750.6 769.6 772.1 771.0 771.1 771.9 771.9 771.3 772.8 770.3 769.7 770.1 769.5 768.7 770.0 767.9 767.9 768.5 769.6 771.3 774.5 774.5 777.2 779.0 780.8 783.2 785.8 786.4 787.2 788.4 788.1 788.1 788.9 788.5 788.6 788.9 788.5 788.2 788.5 788.1 788.2 788.5 788.1 788.1 788.4 788.0 787.6 788.4 787.6 787.7 788.0 787.6 787.3 788.1 787.1 786.8 787.2 786.7 786.7 787.5 787.2 787.2 787.6 787.6 787.8 788.5 788.5 788.6 789.0 788.5 788.6 789.0 789.0 789.0 789.0 788.9 788.5 788.6 789.0 788.4 787.6 787.6 783.6 781.7 774.6 768.4 762.1 754.4 750.8 739.7 728.9 718.2 707.6 697.2 687.0 676.9 667.0 657.2 647.5 638.0 628.6 619.4 610.3 601.4 592.5 583.8 575.3 566.8 558.5 550.3 542.2 534.2 524.3 525.1 525.9 525.7 526.0 526.0 526.3 526.9 526.9 527.2 527.5 527.5 527.8 527.8 527.5 527.8 527.8 527.9 528.2 528.4 528.5 529.0 529.0 529.0 529.0 529.0 529.0 529.3 529.0 529.0 529.3 529.3 529.1 529.3 529.3 529.1 529.3 529.0 529.0 529.0 529.0 528.7 529.0 528.7 528.7 528.7 528.7 528.5 528.7 528.4 528.5 528.7 528.4 528.5 528.7 528.4 528.4 528.7 528.4 528.4 528.7 528.4 528.5 528.7 528.4 528.5 528.7 528.7 528.8 529.0 
528.7 528.7 529.0 529.0 528.7 529.0 529.0 528.8 529.0 528.7 528.8 529.0 528.7 528.7 528.7 528.5 528.7 529.0 528.8 529.0 529.3 529.0 529.3 529.3 529.0 529.0 529.0 528.8 529.1 529.3 528.8 529.3 529.6 529.7 529.9 530.5 530.2 530.7 531.2 531.2 531.6 532.4 532.7 531.6 532.4 532.3 533.5 534.7 535.9 537.1 538.3 539.5 540.7 541.9 543.2 544.4 545.6 546.8 548.0 549.3 550.5 551.7 553.0 554.2 555.4 556.7 557.9 559.2 560.4 561.7 562.9 564.2 565.5 566.7 568.0 569.3 570.5 571.8 573.1 574.4 575.7 577.0 578.3 579.6 580.9 582.2 583.5 584.8 586.1 587.4 588.7 590.0 591.2 591.9 592.4 593.3 594.2 593.9 594.2 594.2 594.1 593.8 593.7 593.0 592.4 592.0 591.4 591.0 590.6 590.0 578.6 570.6 559.6 551.0 543.9 533.3 525.7 514.8 506.0 498.5 498.0 497.3 496.9 496.0 495.3 494.3 493.6 490.6 488.3 486.0 481.8 479.3 475.7 473.2 471.6 479.4 487.4 495.6 503.8 512.2 520.7 529.4 538.2 547.2 556.3 565.6 574.9 578.1 579.1 580.2 581.5 582.9 583.9 585.1 584.3 585.2 584.6 585.0 585.4 585.6 585.1 585.6 585.1 585.7 586.3 587.3 587.3 588.4 588.7 589.0 589.9 590.7 591.2 591.8 591.8 592.2 592.4 592.8 592.8 592.8 593.1 592.8 593.1 593.1 593.1 593.2 593.5 593.1 593.2 593.5 593.2 593.5 593.4 593.2 593.5 593.5 593.5 593.5 593.4 593.1 593.2 593.5 593.5 593.5 593.5 593.8 593.8 593.9 594.2 594.2 593.9 594.2 594.0 593.6 594.2 593.6 592.4 591.6 590.4 588.7 585.6 583.9 580.4 579.6 584.7 589.8 595.0 600.3 605.6 610.9 616.3 621.7 627.2 632.7 638.3 643.9 649.6 655.3 661.0 666.9 672.7 678.7 684.6 690.7 696.7 702.9 709.1 715.3 721.6 728.0 734.4 740.8 747.4 753.9 760.6 767.3 774.0 780.0 782.6 784.4 785.7 787.2 788.4 789.7 791.3 791.5 792.6 793.0 792.6 792.6 792.5 791.7 792.1 791.7 791.3 792.2 792.2 792.0 793.0 792.6 792.8 793.6 793.8 793.3 794.5 793.9 793.1 793.6 793.4 792.6 793.4 792.6 792.7 793.3 794.0 793.8 794.9 794.8 794.6 794.9 794.5 794.5 794.4 794.0 794.0 794.5 794.5 794.6 794.9 794.9 795.1 795.9 795.5 795.9 796.3 796.4 796.8 797.6 797.3 797.7 797.7 797.7 797.6 797.2 796.8 796.6 795.9 794.4 793.9 793.6 792.6 792.2 791.6 791.0 789.9 787.3 785.8 781.2 776.6 773.6 765.0 756.5 748.1 739.8 731.6 723.5 715.4 707.5 699.6 691.9 684.2 676.6 669.1 661.6 654.3 647.0 639.8 632.7 625.7 618.7 611.9 605.1 597.7 588.3 581.3 573.7 576.2 578.6 582.0 583.9 585.7 587.7 589.7 589.9 591.1 591.8 591.8 592.5 592.8 592.8 592.8 592.8 592.8 592.8 593.1 593.1 593.1 593.2 593.5 593.5 593.6 594.2 594.1 593.9 594.2 594.1 593.8 593.8 593.7 593.1 592.4 592.4 591.5 590.4 590.4 589.2 588.3 581.9 577.0 572.6 567.6 564.1 558.6 554.3 550.9 545.0 541.1 535.5 531.0 527.2 527.6 527.8 527.9 528.2 528.4 528.2 528.7 528.7 528.4 528.4 528.1 527.8 527.7 527.5 527.5 527.5 527.5 526.9 526.9 526.9 526.6 526.9 526.6 526.6 526.9 526.9 526.9 527.5 527.6 527.8 527.8 527.5 527.1 526.2 524.5 522.6 521.1 516.6 513.9 511.9 508.2 507.4 506.7 505.9 505.1 504.4 503.6 502.9 502.1 501.3 500.6 499.8 499.1 498.5 498.5 499.0 499.0 499.4 499.9 499.9 500.2 499.9 499.8 499.3 499.0 499.0 498.8 498.8 498.8 498.8 498.5 498.8 498.5 498.5 498.5 498.4 498.2 497.5 497.2 496.5 496.4 495.9 495.0 494.9 494.2 492.6 491.9 489.3 486.7 484.8 482.5 480.7 477.6 473.2 469.0 464.7 460.5 456.3 452.2 448.1 444.0 440.0 436.0 432.1 428.2 424.3 420.5 416.7 414.1 416.3 418.6 421.6 425.1 428.1 430.9 433.1 433.7 435.3 434.7 434.7 435.4 434.9 435.6 437.3 437.5 439.7 442.0 443.9 446.0 449.2 451.2 453.4 457.8 460.1 461.9 465.6 467.0 469.1 470.4 471.3 471.9 472.4 472.7 472.7 472.7 472.9 472.9 472.7 472.7 472.7 471.5 471.0 468.9 466.6 464.8 460.6 458.4 454.7 452.5 453.3 454.1 454.9 455.7 456.5 457.3 458.0 458.8 459.6 460.4 461.2 462.0 462.8 
463.6 464.4 465.2 466.1 466.9 467.7 468.5 469.3 470.1 470.8 470.1 469.7 469.9 470.5 470.5 470.4 469.9 469.7 469.3 468.6 468.3 468.2 467.8 467.8 468.1 467.5 465.9 464.8 460.4 456.8 450.1 445.3 441.0 437.1 440.2 443.3 446.4 449.6 452.8 456.0 459.2 462.5 465.8 469.1 472.4 473.8 473.7 473.2 472.4 472.4 472.0 471.6 471.6 471.5 471.0 470.7 470.4 469.9 469.9 469.9 469.7 469.7 469.9 470.2 470.2 470.2 470.2 470.2 470.5 470.5 470.5 470.7 470.5 470.5 470.8 470.8 470.8 471.0 470.8 470.8 471.0 471.0 470.8 470.8 470.8 470.5 470.5 470.5 470.4 470.2 469.9 469.4 467.8 465.8 462.9 458.9 454.9 451.6 448.1 446.1 447.6 449.1 450.5 452.0 453.5 455.0 456.5 458.0 459.5 461.0 462.5 464.0 465.5 467.0 468.5 470.1 471.4 471.9 472.0 471.3 471.6 471.9 472.1 471.9 472.1 472.3 471.8 471.6 470.7 470.2 468.1 465.4 463.2 457.1 452.1 443.6 437.8 432.9 429.2 430.3 431.4 432.6 433.7 434.9 436.0 437.2 438.3 439.5 440.6 441.8 443.0 444.1 445.3 446.5 447.6 448.8 449.8 449.8 449.9 450.8 451.6 451.8 452.9 454.0 454.5 455.6 456.1 456.6 457.1 457.4 458.3 459.0 459.2 460.1 460.8 461.2 462.1 462.9 463.6 464.0 465.4 465.7 465.9 466.9 466.7 467.1 467.5 467.5 467.6 468.1 468.1 468.4 468.6 468.3 468.3 468.2 467.5 467.2 466.5 465.6 464.7 464.0 462.7 460.9 460.0 457.2 455.1 453.2 449.3 444.3 437.3 432.9 430.0 425.5 423.1 420.7 417.5 415.1 412.5 409.4 405.3 403.7 401.2 398.8 398.4 397.0 396.0 395.4 396.9 397.7 398.2 398.2 398.4 398.4 398.4 398.0 397.1 396.3 393.6 391.5 388.6 386.0 381.9 377.4 372.4 364.3 357.8 353.9 348.6 346.4 342.8 339.3 335.8 332.3 328.9 325.5 322.1 318.8 315.5 312.3 309.0 305.8 302.7 300.4 301.2 302.0 301.6 301.1 300.3 300.0 299.0 298.4 298.1 298.1 297.8 297.9 297.9 297.8 297.8 297.9 297.8 297.8 297.9 297.8 297.8 297.6 297.6 297.6 297.6 297.8 297.9 297.9 297.9 298.3 298.1 298.1 297.9 297.8 297.4 297.2 296.9 296.8 296.9 296.8 296.9 297.2 297.0 297.2 297.4 297.2 297.9 297.9 297.9 298.1 297.4 296.7 295.7 294.9 293.4 291.8 291.0 291.6 298.1 304.8 311.6 318.5 325.6 332.9 340.4 348.0 355.7 363.7 371.8 380.1 388.6 396.9 397.5 397.7 397.5 397.5 397.5 397.7 397.7 397.7 397.7 397.3 396.7 396.6 395.7 394.7 393.1 391.7 389.1 384.3 380.4 374.6 371.4 370.9 370.5 370.0 369.6 369.1 368.7 368.2 367.8 367.3 366.9 366.4 366.0 365.5 365.1 364.7 364.2 363.8 363.4 364.2 363.7 362.8 362.3 361.8 361.1 360.8 360.1 359.2 359.0 358.4 358.2 358.0 357.4 357.2 356.9 356.8 356.8 356.9 356.8 356.7 356.6 356.6 356.5 356.4 356.1 355.9 355.9 355.8 355.9 355.8 355.9 355.9 355.8 355.9 356.1 356.0 356.2 356.1 355.9 355.9 355.9 355.9 355.9 355.9 356.0 356.1 355.9 355.9 355.5 355.3 355.1 354.7 354.9 354.7 354.7 354.7 354.7 354.4 353.7 354.3 354.9 355.5 356.0 356.6 357.2 357.8 358.4 359.0 359.6 360.2 360.8 361.4 362.0 362.6 366.5 370.9 374.3 376.4 378.0 380.7 382.6 385.0 387.4 389.3 390.6 391.7 392.4 393.4 393.6 394.5 394.5 394.5 394.8 395.2 395.5 395.9 396.1 396.4 396.6 397.1 397.3 397.7 397.7 397.9 398.2 398.4 398.4 398.6 398.6 398.6 398.6 398.6 398.6 398.6 398.4 398.4 398.6 398.4 398.4 398.3 398.1 397.9 397.9 397.7 397.5 397.4 397.0 397.0 396.8 396.5 396.3 396.1 395.4 394.0 392.4 389.9 387.3 383.9 381.3 377.9 377.5 381.6 385.7 389.9 394.1 398.4 402.7 407.1 411.5 415.9 420.4 425.0 429.6 434.2 438.9 443.7 448.5 453.4 458.3 463.2 467.3 467.5 468.1 468.1 469.0 469.5 469.7 470.0 470.5 470.5 470.5 470.5 470.7 470.5 470.8 470.8 471.0 470.8 471.0 471.3 471.4 471.9 472.1 472.1 472.4 472.4 472.4 472.4 472.4 472.0 471.2 470.5 469.0 468.1 466.5 464.8 465.9 466.8 468.6 471.9 474.9 477.6 482.3 485.1 488.4 492.1 494.5 497.4 500.5 503.8 506.0 508.9 513.4 516.3 521.6 523.4 
523.9 524.2 524.2 524.4 522.9 522.9 521.0 520.2 519.6 519.6 519.3 520.2 520.2 521.0 521.9 522.6 523.8 525.1 525.7 526.1 526.9 527.5 527.5 528.2 528.6 528.1 528.5 529.0 528.5 528.8 529.0 528.7 528.7 529.0 528.7 528.7 529.0 528.7 528.8 529.0 529.1 529.4 529.8 530.6 531.5 532.4 533.7 534.9 536.7 538.2 539.6 541.4 543.0 545.0 546.6 548.1 549.9 551.5 553.1 554.5 555.6 556.9 557.8 558.8 559.3 559.6 559.9 560.2 560.2 560.4 560.2 560.2 560.5 560.2 560.2 560.5 560.2 560.2 560.4 560.2 560.2 560.4 560.2 560.2 560.4 560.2 560.2 560.5 560.2 560.2 560.5 560.2 560.5 560.7 560.2 560.5 560.7 560.2 560.5 560.7 560.2 560.5 560.8 560.2 560.2 560.5 559.9 560.2 560.1 559.8 559.8 560.1 559.8 559.9 560.1 559.8 560.2 560.2 559.9 560.2 560.2 559.8 559.8 560.3 559.6 559.8 559.6 558.2 556.4 552.1 548.9 542.1 537.7 534.2 535.4 536.7 537.9 539.1 540.4 541.6 542.9 544.1 545.4 546.7 547.9 549.2 550.5 551.7 553.0 554.3 555.6 556.9 558.2 559.4 560.7 562.0 563.3 564.6 565.9 567.4 569.6 570.4 572.9 576.5 579.5 582.6 586.2 589.0 591.0 592.3 593.5 593.5 593.5 593.4 593.2 593.8 593.8 593.8 594.6 594.8 594.8 595.5 595.5 595.5 595.5 595.2 595.2 595.2 594.5 594.4 594.2 593.8 593.8 593.8 593.9 594.2 594.1 593.8 594.2 594.1 593.8 594.2 594.2 593.9 594.2 594.2 593.9 594.2 594.4 593.9 594.2 594.4 593.8 594.2 594.4 593.8 594.2 594.5 593.9 594.2 594.5 593.9 594.2 594.4 593.9 594.2 594.1 593.8 594.2 594.1 593.8 594.2 594.2 593.8 593.9 594.2 593.8 593.8 594.1 593.8 593.8 594.1 593.8 593.9 594.1 593.8 593.9 594.2 593.8 593.9 594.2 593.8 593.8 594.1 593.8 593.8 594.1 593.8 593.9 594.1 593.8 593.9 594.2 593.5 593.9 594.2 593.5 593.5 593.7 593.2 593.5 593.4 592.8 593.1 593.0 592.4 592.8 593.1 592.5 592.8 593.1 592.6 593.1 593.1 592.8 593.1 593.4 592.8 593.2 593.4 592.8 593.2 593.5 592.9 593.2 593.5 592.8 593.1 593.3 592.9 593.5 593.8 593.5 594.2 594.5 594.5 596.3 596.6 597.9 599.5 600.4 601.3 602.4 603.6 604.0 604.5 604.9 604.9 604.3 605.0 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6 605.6",
"input_type": "phoneme",
"offset": 72.491
}
]
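Each segment in this DiffSinger-style .ds project file is a plain JSON object: ph_seq/ph_dur give the phoneme sequence and per-phoneme durations in seconds, note_seq/note_dur_seq/is_slur_seq give the note track, f0_seq is a frame-level pitch curve in Hz sampled every f0_timestep seconds, and offset is the segment's start time in the song. Below is a minimal sketch of loading the file and checking that the f0 curve roughly covers the phoneme durations; the helper name check_ds_segment is illustrative only and not part of this repo.

# Minimal sketch (illustrative, not part of this repo): parse one ds segment and
# verify that the frame-level f0 curve roughly matches the total phoneme duration.
import json

def check_ds_segment(seg):
    ph_dur = [float(d) for d in seg["ph_dur"].split(" ")]   # per-phoneme durations, seconds
    f0_seq = [float(f) for f in seg["f0_seq"].split(" ")]   # frame-level f0, Hz
    f0_timestep = float(seg["f0_timestep"])                 # seconds per f0 frame, e.g. 0.005
    total_dur = sum(ph_dur)
    expected_frames = round(total_dur / f0_timestep)
    print(f"offset={seg['offset']}s  dur={total_dur:.3f}s  "
          f"f0 frames={len(f0_seq)} (~{expected_frames} expected)")

with open("infer/share.ds", encoding="utf-8") as f:
    for segment in json.load(f):
        check_ds_segment(segment)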
modules/attentions.py
ADDED
@@ -0,0 +1,397 @@
1 |
+
import copy
|
2 |
+
import math
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
from torch import nn
|
6 |
+
from torch.nn import functional as F
|
7 |
+
|
8 |
+
import modules.commons as commons
|
9 |
+
|
10 |
+
|
11 |
+
class LayerNorm(nn.Module):
|
12 |
+
def __init__(self, channels, eps=1e-5):
|
13 |
+
super().__init__()
|
14 |
+
self.channels = channels
|
15 |
+
self.eps = eps
|
16 |
+
|
17 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
18 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
19 |
+
|
20 |
+
def forward(self, x):
|
21 |
+
x = x.transpose(1, -1)
|
22 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
23 |
+
return x.transpose(1, -1)
|
24 |
+
|
25 |
+
|
26 |
+
class Encoder(nn.Module):
|
27 |
+
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
|
28 |
+
super().__init__()
|
29 |
+
self.hidden_channels = hidden_channels
|
30 |
+
self.filter_channels = filter_channels
|
31 |
+
self.n_heads = n_heads
|
32 |
+
self.n_layers = n_layers
|
33 |
+
self.kernel_size = kernel_size
|
34 |
+
self.p_dropout = p_dropout
|
35 |
+
self.window_size = window_size
|
36 |
+
|
37 |
+
self.drop = nn.Dropout(p_dropout)
|
38 |
+
self.attn_layers = nn.ModuleList()
|
39 |
+
self.norm_layers_1 = nn.ModuleList()
|
40 |
+
self.ffn_layers = nn.ModuleList()
|
41 |
+
self.norm_layers_2 = nn.ModuleList()
|
42 |
+
for i in range(self.n_layers):
|
43 |
+
self.attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, window_size=window_size))
|
44 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
45 |
+
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout))
|
46 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
47 |
+
|
48 |
+
def forward(self, x, x_mask):
|
49 |
+
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
50 |
+
x = x * x_mask
|
51 |
+
for i in range(self.n_layers):
|
52 |
+
y = self.attn_layers[i](x, x, attn_mask)
|
53 |
+
y = self.drop(y)
|
54 |
+
x = self.norm_layers_1[i](x + y)
|
55 |
+
|
56 |
+
y = self.ffn_layers[i](x, x_mask)
|
57 |
+
y = self.drop(y)
|
58 |
+
x = self.norm_layers_2[i](x + y)
|
59 |
+
x = x * x_mask
|
60 |
+
return x
|
61 |
+
|
62 |
+
class Decoder(nn.Module):
|
63 |
+
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
|
64 |
+
super().__init__()
|
65 |
+
self.hidden_channels = hidden_channels
|
66 |
+
self.filter_channels = filter_channels
|
67 |
+
self.n_heads = n_heads
|
68 |
+
self.n_layers = n_layers
|
69 |
+
self.kernel_size = kernel_size
|
70 |
+
self.p_dropout = p_dropout
|
71 |
+
self.proximal_bias = proximal_bias
|
72 |
+
self.proximal_init = proximal_init
|
73 |
+
|
74 |
+
self.drop = nn.Dropout(p_dropout)
|
75 |
+
self.self_attn_layers = nn.ModuleList()
|
76 |
+
self.norm_layers_0 = nn.ModuleList()
|
77 |
+
self.encdec_attn_layers = nn.ModuleList()
|
78 |
+
self.norm_layers_1 = nn.ModuleList()
|
79 |
+
self.ffn_layers = nn.ModuleList()
|
80 |
+
self.norm_layers_2 = nn.ModuleList()
|
81 |
+
for i in range(self.n_layers):
|
82 |
+
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
|
83 |
+
self.norm_layers_0.append(LayerNorm(hidden_channels))
|
84 |
+
self.encdec_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout))
|
85 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
86 |
+
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
|
87 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
88 |
+
|
89 |
+
def forward(self, x, x_mask, h, h_mask):
|
90 |
+
"""
|
91 |
+
x: decoder input
|
92 |
+
h: encoder output
|
93 |
+
"""
|
94 |
+
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
|
95 |
+
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
96 |
+
x = x * x_mask
|
97 |
+
for i in range(self.n_layers):
|
98 |
+
y = self.self_attn_layers[i](x, x, self_attn_mask)
|
99 |
+
y = self.drop(y)
|
100 |
+
x = self.norm_layers_0[i](x + y)
|
101 |
+
|
102 |
+
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
|
103 |
+
y = self.drop(y)
|
104 |
+
x = self.norm_layers_1[i](x + y)
|
105 |
+
|
106 |
+
y = self.ffn_layers[i](x, x_mask)
|
107 |
+
y = self.drop(y)
|
108 |
+
x = self.norm_layers_2[i](x + y)
|
109 |
+
x = x * x_mask
|
110 |
+
return x
|
111 |
+
|
112 |
+
class FFT(nn.Module):
|
113 |
+
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
|
114 |
+
super().__init__()
|
115 |
+
self.hidden_channels = hidden_channels
|
116 |
+
self.filter_channels = filter_channels
|
117 |
+
self.n_heads = n_heads
|
118 |
+
self.n_layers = n_layers
|
119 |
+
self.kernel_size = kernel_size
|
120 |
+
self.p_dropout = p_dropout
|
121 |
+
self.proximal_bias = proximal_bias
|
122 |
+
self.proximal_init = proximal_init
|
123 |
+
|
124 |
+
self.drop = nn.Dropout(p_dropout)
|
125 |
+
self.self_attn_layers = nn.ModuleList()
|
126 |
+
self.norm_layers_0 = nn.ModuleList()
|
127 |
+
self.ffn_layers = nn.ModuleList()
|
128 |
+
self.norm_layers_1 = nn.ModuleList()
|
129 |
+
for i in range(self.n_layers):
|
130 |
+
self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
|
131 |
+
self.norm_layers_0.append(LayerNorm(hidden_channels))
|
132 |
+
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
|
133 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
134 |
+
|
135 |
+
def forward(self, x, x_mask):
|
136 |
+
"""
|
137 |
+
x: decoder input
|
138 |
+
h: encoder output
|
139 |
+
"""
|
140 |
+
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
|
141 |
+
x = x * x_mask
|
142 |
+
for i in range(self.n_layers):
|
143 |
+
y = self.self_attn_layers[i](x, x, self_attn_mask)
|
144 |
+
y = self.drop(y)
|
145 |
+
x = self.norm_layers_0[i](x + y)
|
146 |
+
|
147 |
+
y = self.ffn_layers[i](x, x_mask)
|
148 |
+
y = self.drop(y)
|
149 |
+
x = self.norm_layers_1[i](x + y)
|
150 |
+
x = x * x_mask
|
151 |
+
return x
|
152 |
+
|
153 |
+
|
154 |
+
class FFNs(nn.Module):
|
155 |
+
def __init__(self, hidden_channels, filter_channels, n_heads, n_layers=1, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
|
156 |
+
super().__init__()
|
157 |
+
self.hidden_channels = hidden_channels
|
158 |
+
self.filter_channels = filter_channels
|
159 |
+
self.n_heads = n_heads
|
160 |
+
self.n_layers = n_layers
|
161 |
+
self.kernel_size = kernel_size
|
162 |
+
self.p_dropout = p_dropout
|
163 |
+
self.proximal_bias = proximal_bias
|
164 |
+
self.proximal_init = proximal_init
|
165 |
+
|
166 |
+
self.drop = nn.Dropout(p_dropout)
|
167 |
+
#self.self_attn_layers = nn.ModuleList()
|
168 |
+
#self.norm_layers_0 = nn.ModuleList()
|
169 |
+
self.ffn_layers = nn.ModuleList()
|
170 |
+
self.norm_layers_1 = nn.ModuleList()
|
171 |
+
for i in range(self.n_layers):
|
172 |
+
#self.self_attn_layers.append(MultiHeadAttention(hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout, proximal_bias=proximal_bias, proximal_init=proximal_init))
|
173 |
+
#self.norm_layers_0.append(LayerNorm(hidden_channels))
|
174 |
+
self.ffn_layers.append(FFN(hidden_channels, hidden_channels, filter_channels, kernel_size, p_dropout=p_dropout, causal=True))
|
175 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
176 |
+
|
177 |
+
def forward(self, x, x_mask):
|
178 |
+
"""
|
179 |
+
x: decoder input
|
180 |
+
h: encoder output
|
181 |
+
"""
|
182 |
+
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
|
183 |
+
x = x * x_mask
|
184 |
+
for i in range(self.n_layers):
|
185 |
+
#y = self.self_attn_layers[i](x, x, self_attn_mask)
|
186 |
+
#y = self.drop(y)
|
187 |
+
#x = self.norm_layers_0[i](x + y)
|
188 |
+
|
189 |
+
y = self.ffn_layers[i](x, x_mask)
|
190 |
+
y = self.drop(y)
|
191 |
+
x = self.norm_layers_1[i](x + y)
|
192 |
+
x = x * x_mask
|
193 |
+
return x
|
194 |
+
|
195 |
+
class MultiHeadAttention(nn.Module):
|
196 |
+
def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
|
197 |
+
super().__init__()
|
198 |
+
assert channels % n_heads == 0
|
199 |
+
|
200 |
+
self.channels = channels
|
201 |
+
self.out_channels = out_channels
|
202 |
+
self.n_heads = n_heads
|
203 |
+
self.p_dropout = p_dropout
|
204 |
+
self.window_size = window_size
|
205 |
+
self.heads_share = heads_share
|
206 |
+
self.block_length = block_length
|
207 |
+
self.proximal_bias = proximal_bias
|
208 |
+
self.proximal_init = proximal_init
|
209 |
+
self.attn = None
|
210 |
+
|
211 |
+
self.k_channels = channels // n_heads
|
212 |
+
self.conv_q = nn.Conv1d(channels, channels, 1)
|
213 |
+
self.conv_k = nn.Conv1d(channels, channels, 1)
|
214 |
+
self.conv_v = nn.Conv1d(channels, channels, 1)
|
215 |
+
self.conv_o = nn.Conv1d(channels, out_channels, 1)
|
216 |
+
self.drop = nn.Dropout(p_dropout)
|
217 |
+
|
218 |
+
if window_size is not None:
|
219 |
+
n_heads_rel = 1 if heads_share else n_heads
|
220 |
+
rel_stddev = self.k_channels**-0.5
|
221 |
+
self.emb_rel_k = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
|
222 |
+
self.emb_rel_v = nn.Parameter(torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels) * rel_stddev)
|
223 |
+
|
224 |
+
nn.init.xavier_uniform_(self.conv_q.weight)
|
225 |
+
nn.init.xavier_uniform_(self.conv_k.weight)
|
226 |
+
nn.init.xavier_uniform_(self.conv_v.weight)
|
227 |
+
if proximal_init:
|
228 |
+
with torch.no_grad():
|
229 |
+
self.conv_k.weight.copy_(self.conv_q.weight)
|
230 |
+
self.conv_k.bias.copy_(self.conv_q.bias)
|
231 |
+
|
232 |
+
def forward(self, x, c, attn_mask=None):
|
233 |
+
q = self.conv_q(x)
|
234 |
+
k = self.conv_k(c)
|
235 |
+
v = self.conv_v(c)
|
236 |
+
|
237 |
+
x, self.attn = self.attention(q, k, v, mask=attn_mask)
|
238 |
+
|
239 |
+
x = self.conv_o(x)
|
240 |
+
return x
|
241 |
+
|
242 |
+
def attention(self, query, key, value, mask=None):
|
243 |
+
# reshape [b, d, t] -> [b, n_h, t, d_k]
|
244 |
+
b, d, t_s, t_t = (*key.size(), query.size(2))
|
245 |
+
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
|
246 |
+
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
247 |
+
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
248 |
+
|
249 |
+
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
|
250 |
+
if self.window_size is not None:
|
251 |
+
assert t_s == t_t, "Relative attention is only available for self-attention."
|
252 |
+
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
|
253 |
+
rel_logits = self._matmul_with_relative_keys(query /math.sqrt(self.k_channels), key_relative_embeddings)
|
254 |
+
scores_local = self._relative_position_to_absolute_position(rel_logits)
|
255 |
+
scores = scores + scores_local
|
256 |
+
if self.proximal_bias:
|
257 |
+
assert t_s == t_t, "Proximal bias is only available for self-attention."
|
258 |
+
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        ret = torch.matmul(x, y.unsqueeze(0))
        return ret

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
        return ret

    def _get_relative_embeddings(self, relative_embeddings, length):
        max_relative_position = 2 * self.window_size + 1
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded_relative_embeddings = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded_relative_embeddings = relative_embeddings
        used_relative_embeddings = padded_relative_embeddings[:, slice_start_position:slice_end_position]
        return used_relative_embeddings

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[:, :, :length, length - 1:]
        return x_final

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
        return x_final

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)


class FFN(nn.Module):
    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        if causal:
            self.padding = self._causal_padding
        else:
            self.padding = self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        x = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            x = x * torch.sigmoid(1.702 * x)
        else:
            x = torch.relu(x)
        x = self.drop(x)
        x = self.conv_2(self.padding(x * x_mask))
        return x * x_mask

    def _causal_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = self.kernel_size - 1
        pad_r = 0
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x

    def _same_padding(self, x):
        if self.kernel_size == 1:
            return x
        pad_l = (self.kernel_size - 1) // 2
        pad_r = self.kernel_size // 2
        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
        x = F.pad(x, commons.convert_pad_shape(padding))
        return x
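For a quick sanity check, here is a minimal sketch of driving the FFN block above in isolation; the batch size, channel counts and frame count are illustrative and not taken from config.json:

import torch
from modules.attentions import FFN

# [batch, channels, frames] dummy input, with a mask that keeps every frame
x = torch.randn(2, 192, 50)
x_mask = torch.ones(2, 1, 50)
ffn = FFN(in_channels=192, out_channels=192, filter_channels=768, kernel_size=3)
y = ffn(x, x_mask)  # same [2, 192, 50] shape; masked positions stay zero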
modules/commons.py
ADDED
@@ -0,0 +1,162 @@
import math
import numpy as np
import torch
from torch import nn
from torch.nn import functional as F


def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)


def get_padding(kernel_size, dilation=1):
    return int((kernel_size*dilation - dilation)/2)


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def intersperse(lst, item):
    result = [item] * (len(lst) * 2 + 1)
    result[1::2] = lst
    return result


def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q)"""
    kl = (logs_q - logs_p) - 0.5
    kl += 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
    return kl


def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
    return -torch.log(-torch.log(uniform_samples))


def rand_gumbel_like(x):
    g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
    return g


def slice_segments(x, ids_str, segment_size=4):
    ret = torch.zeros_like(x[:, :, :segment_size])
    # print("ret shape: ", ret.shape, ids_str)
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        ret[i] = x[i, :, idx_str:idx_end]
    return ret


def rand_slice_segments(x, x_lengths=None, segment_size=4):
    b, d, t = x.size()
    if x_lengths is None:
        x_lengths = t
    ids_str_max = x_lengths - segment_size - 1
    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
    ret = slice_segments(x, ids_str, segment_size)
    return ret, ids_str


def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal


def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return x + signal.to(dtype=x.dtype, device=x.device)


def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    b, channels, length = x.size()
    signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)


def subsequent_mask(length):
    mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
    return mask


@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    n_channels_int = n_channels[0]
    in_act = input_a + input_b
    t_act = torch.tanh(in_act[:, :n_channels_int, :])
    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
    acts = t_act * s_act
    return acts


def convert_pad_shape(pad_shape):
    l = pad_shape[::-1]
    pad_shape = [item for sublist in l for item in sublist]
    return pad_shape


def shift_1d(x):
    x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
    return x


def sequence_mask(length, max_length=None):
    if max_length is None:
        max_length = length.max()
    x = torch.arange(max_length, dtype=length.dtype, device=length.device)
    return x.unsqueeze(0) < length.unsqueeze(1)


def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]
    """
    device = duration.device

    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    cum_duration_flat = cum_duration.view(b * t_x)
    path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
    path = path.view(b, t_x, t_y)
    path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path.unsqueeze(1).transpose(2, 3) * mask
    return path


def clip_grad_value_(parameters, clip_value, norm_type=2):
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    parameters = list(filter(lambda p: p.grad is not None, parameters))
    norm_type = float(norm_type)
    if clip_value is not None:
        clip_value = float(clip_value)

    total_norm = 0
    for p in parameters:
        param_norm = p.grad.data.norm(norm_type)
        total_norm += param_norm.item() ** norm_type
        if clip_value is not None:
            p.grad.data.clamp_(min=-clip_value, max=clip_value)
    total_norm = total_norm ** (1. / norm_type)
    return total_norm
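A small sketch, with made-up sizes, of how sequence_mask and generate_path above combine to expand per-phoneme durations into a frame-level alignment path:

import torch
from modules.commons import sequence_mask, generate_path

durations = torch.tensor([[[2, 3, 1]]])        # [b=1, 1, t_x=3] frames per phoneme
x_mask = torch.ones(1, 1, 3)                   # phoneme-level mask
t_y = int(durations.sum())                     # 6 output frames in total
y_mask = sequence_mask(torch.tensor([t_y]), t_y).unsqueeze(1).float()   # [1, 1, 6]
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)    # [1, 1, 6, 3]
path = generate_path(durations, attn_mask)     # [1, 1, 6, 3] hard monotonic alignment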
modules/ddsp.py
ADDED
@@ -0,0 +1,189 @@
import torch
import torch.nn as nn
from torch.nn import functional as F
import torch.fft as fft
import numpy as np
import librosa as li
import math
from scipy.signal import get_window

def safe_log(x):
    return torch.log(x + 1e-7)


@torch.no_grad()
def mean_std_loudness(dataset):
    mean = 0
    std = 0
    n = 0
    for _, _, l in dataset:
        n += 1
        mean += (l.mean().item() - mean) / n
        std += (l.std().item() - std) / n
    return mean, std


def multiscale_fft(signal, scales, overlap):
    stfts = []
    for s in scales:
        S = torch.stft(
            signal,
            s,
            int(s * (1 - overlap)),
            s,
            torch.hann_window(s).to(signal),
            True,
            normalized=True,
            return_complex=True,
        ).abs()
        stfts.append(S)
    return stfts


def resample(x, factor: int):
    batch, frame, channel = x.shape
    x = x.permute(0, 2, 1).reshape(batch * channel, 1, frame)

    window = torch.hann_window(
        factor * 2,
        dtype=x.dtype,
        device=x.device,
    ).reshape(1, 1, -1)
    y = torch.zeros(x.shape[0], x.shape[1], factor * x.shape[2]).to(x)
    y[..., ::factor] = x
    y[..., -1:] = x[..., -1:]
    y = torch.nn.functional.pad(y, [factor, factor])
    y = torch.nn.functional.conv1d(y, window)[..., :-1]

    y = y.reshape(batch, channel, factor * frame).permute(0, 2, 1)

    return y


def upsample(signal, factor):
    signal = signal.permute(0, 2, 1)
    signal = nn.functional.interpolate(signal, size=signal.shape[-1] * factor)
    return signal.permute(0, 2, 1)


def remove_above_nyquist(amplitudes, pitch, sampling_rate):
    n_harm = amplitudes.shape[-1]
    pitches = pitch * torch.arange(1, n_harm + 1).to(pitch)
    aa = (pitches < sampling_rate / 2).float() + 1e-4
    return amplitudes * aa


def scale_function(x):
    return 2 * torch.sigmoid(x)**(math.log(10)) + 1e-7


def extract_loudness(signal, sampling_rate, block_size, n_fft=2048):
    S = li.stft(
        signal,
        n_fft=n_fft,
        hop_length=block_size,
        win_length=n_fft,
        center=True,
    )
    S = np.log(abs(S) + 1e-7)
    f = li.fft_frequencies(sampling_rate, n_fft)
    a_weight = li.A_weighting(f)

    S = S + a_weight.reshape(-1, 1)

    S = np.mean(S, 0)[..., :-1]

    return S


def extract_pitch(signal, sampling_rate, block_size):
    length = signal.shape[-1] // block_size
    f0 = crepe.predict(
        signal,
        sampling_rate,
        step_size=int(1000 * block_size / sampling_rate),
        verbose=1,
        center=True,
        viterbi=True,
    )
    f0 = f0[1].reshape(-1)[:-1]

    if f0.shape[-1] != length:
        f0 = np.interp(
            np.linspace(0, 1, length, endpoint=False),
            np.linspace(0, 1, f0.shape[-1], endpoint=False),
            f0,
        )

    return f0


def mlp(in_size, hidden_size, n_layers):
    channels = [in_size] + (n_layers) * [hidden_size]
    net = []
    for i in range(n_layers):
        net.append(nn.Linear(channels[i], channels[i + 1]))
        net.append(nn.LayerNorm(channels[i + 1]))
        net.append(nn.LeakyReLU())
    return nn.Sequential(*net)


def gru(n_input, hidden_size):
    return nn.GRU(n_input * hidden_size, hidden_size, batch_first=True)


def harmonic_synth(pitch, amplitudes, sampling_rate):
    n_harmonic = amplitudes.shape[-1]
    omega = torch.cumsum(2 * math.pi * pitch / sampling_rate, 1)
    omegas = omega * torch.arange(1, n_harmonic + 1).to(omega)
    signal = (torch.sin(omegas) * amplitudes).sum(-1, keepdim=True)
    return signal


def amp_to_impulse_response(amp, target_size):
    amp = torch.stack([amp, torch.zeros_like(amp)], -1)
    amp = torch.view_as_complex(amp)
    amp = fft.irfft(amp)

    filter_size = amp.shape[-1]

    amp = torch.roll(amp, filter_size // 2, -1)
    win = torch.hann_window(filter_size, dtype=amp.dtype, device=amp.device)

    amp = amp * win

    amp = nn.functional.pad(amp, (0, int(target_size) - int(filter_size)))
    amp = torch.roll(amp, -filter_size // 2, -1)

    return amp


def fft_convolve(signal, kernel):
    signal = nn.functional.pad(signal, (0, signal.shape[-1]))
    kernel = nn.functional.pad(kernel, (kernel.shape[-1], 0))

    output = fft.irfft(fft.rfft(signal) * fft.rfft(kernel))
    output = output[..., output.shape[-1] // 2:]

    return output


def init_kernels(win_len, win_inc, fft_len, win_type=None, invers=False):
    if win_type == 'None' or win_type is None:
        window = np.ones(win_len)
    else:
        window = get_window(win_type, win_len, fftbins=True)  # **0.5

    N = fft_len
    fourier_basis = np.fft.rfft(np.eye(N))[:win_len]
    real_kernel = np.real(fourier_basis)
    imag_kernel = np.imag(fourier_basis)
    kernel = np.concatenate([real_kernel, imag_kernel], 1).T

    if invers:
        kernel = np.linalg.pinv(kernel).T

    kernel = kernel*window
    kernel = kernel[:, None, :]
    return torch.from_numpy(kernel.astype(np.float32)), torch.from_numpy(window[None, :, None].astype(np.float32))
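A hedged sketch of the DDSP harmonic branch using the helpers above; the sample rate, hop size, pitch value and harmonic count are illustrative placeholders, not values from the model config:

import torch
from modules.ddsp import upsample, harmonic_synth, remove_above_nyquist

sr, hop = 44100, 512
f0_frames = torch.full((1, 20, 1), 220.0)     # [b, frames, 1] frame-level pitch in Hz
amps = torch.rand(1, 20, 8)                   # 8 harmonic amplitudes per frame
amps = remove_above_nyquist(amps, f0_frames, sr)
f0 = upsample(f0_frames, hop)                 # -> [1, 20*hop, 1] sample-level pitch
amps = upsample(amps, hop)                    # -> [1, 20*hop, 8]
audio = harmonic_synth(f0, amps, sr)          # [1, 20*hop, 1] harmonic waveform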
modules/losses.py
ADDED
@@ -0,0 +1,62 @@
import torch
from torch.nn import functional as F

import modules.commons
import math

def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            rl = rl.float().detach()
            gl = gl.float()
            loss += torch.mean(torch.abs(rl - gl))

    return loss * 2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        dr = dr.float()
        dg = dg.float()
        r_loss = torch.mean((1-dr)**2)
        g_loss = torch.mean(dg**2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        dg = dg.float()
        l = torch.mean((1-dg)**2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses


def kl_loss(z_p, logs_q, m_p, logs_p, z_mask):
    """
    z_p, logs_q: [b, h, t_t]
    m_p, logs_p: [b, h, t_t]
    """
    z_p = z_p.float()
    logs_q = logs_q.float()
    m_p = m_p.float()
    logs_p = logs_p.float()
    z_mask = z_mask.float()

    kl = logs_p - logs_q - 0.5
    kl += 0.5 * ((z_p - m_p)**2) * torch.exp(-2. * logs_p)
    kl = torch.sum(kl * z_mask)
    l = kl / torch.sum(z_mask)
    return l
ADDED
@@ -0,0 +1,450 @@
|
1 |
+
import copy
|
2 |
+
import math
|
3 |
+
import numpy as np
|
4 |
+
import scipy
|
5 |
+
import torch
|
6 |
+
from torch import nn
|
7 |
+
from torch.nn import functional as F
|
8 |
+
from torch.autograd import Function
|
9 |
+
from typing import Any, Optional, Tuple
|
10 |
+
|
11 |
+
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
|
12 |
+
from torch.nn.utils import weight_norm, remove_weight_norm
|
13 |
+
|
14 |
+
import modules.commons as commons
|
15 |
+
import modules.attentions as attentions
|
16 |
+
from modules.commons import init_weights, get_padding
|
17 |
+
from modules.transforms import piecewise_rational_quadratic_transform
|
18 |
+
|
19 |
+
|
20 |
+
LRELU_SLOPE = 0.1
|
21 |
+
|
22 |
+
|
23 |
+
class LayerNorm(nn.Module):
|
24 |
+
def __init__(self, channels, eps=1e-5):
|
25 |
+
super().__init__()
|
26 |
+
self.channels = channels
|
27 |
+
self.eps = eps
|
28 |
+
|
29 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
30 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
31 |
+
|
32 |
+
def forward(self, x):
|
33 |
+
x = x.transpose(1, -1)
|
34 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
35 |
+
return x.transpose(1, -1)
|
36 |
+
|
37 |
+
|
38 |
+
class ConvReluNorm(nn.Module):
|
39 |
+
def __init__(self, in_channels, hidden_channels, out_channels, kernel_size, n_layers, p_dropout):
|
40 |
+
super().__init__()
|
41 |
+
self.in_channels = in_channels
|
42 |
+
self.hidden_channels = hidden_channels
|
43 |
+
self.out_channels = out_channels
|
44 |
+
self.kernel_size = kernel_size
|
45 |
+
self.n_layers = n_layers
|
46 |
+
self.p_dropout = p_dropout
|
47 |
+
assert n_layers > 1, "Number of layers should be larger than 1."
|
48 |
+
|
49 |
+
self.conv_layers = nn.ModuleList()
|
50 |
+
self.norm_layers = nn.ModuleList()
|
51 |
+
self.conv_layers.append(nn.Conv1d(in_channels, hidden_channels, kernel_size, padding=kernel_size//2))
|
52 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
53 |
+
self.relu_drop = nn.Sequential(
|
54 |
+
nn.ReLU(),
|
55 |
+
nn.Dropout(p_dropout))
|
56 |
+
for _ in range(n_layers-1):
|
57 |
+
self.conv_layers.append(nn.Conv1d(hidden_channels, hidden_channels, kernel_size, padding=kernel_size//2))
|
58 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
59 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
60 |
+
self.proj.weight.data.zero_()
|
61 |
+
self.proj.bias.data.zero_()
|
62 |
+
|
63 |
+
def forward(self, x, x_mask):
|
64 |
+
x_org = x
|
65 |
+
for i in range(self.n_layers):
|
66 |
+
x = self.conv_layers[i](x * x_mask)
|
67 |
+
x = self.norm_layers[i](x)
|
68 |
+
x = self.relu_drop(x)
|
69 |
+
x = x_org + self.proj(x)
|
70 |
+
return x * x_mask
|
71 |
+
|
72 |
+
|
73 |
+
class DDSConv(nn.Module):
|
74 |
+
"""
|
75 |
+
Dilated and Depth-Separable Convolution
|
76 |
+
"""
|
77 |
+
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.):
|
78 |
+
super().__init__()
|
79 |
+
self.channels = channels
|
80 |
+
self.kernel_size = kernel_size
|
81 |
+
self.n_layers = n_layers
|
82 |
+
self.p_dropout = p_dropout
|
83 |
+
|
84 |
+
self.drop = nn.Dropout(p_dropout)
|
85 |
+
self.convs_sep = nn.ModuleList()
|
86 |
+
self.convs_1x1 = nn.ModuleList()
|
87 |
+
self.norms_1 = nn.ModuleList()
|
88 |
+
self.norms_2 = nn.ModuleList()
|
89 |
+
for i in range(n_layers):
|
90 |
+
dilation = kernel_size ** i
|
91 |
+
padding = (kernel_size * dilation - dilation) // 2
|
92 |
+
self.convs_sep.append(nn.Conv1d(channels, channels, kernel_size,
|
93 |
+
groups=channels, dilation=dilation, padding=padding
|
94 |
+
))
|
95 |
+
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
|
96 |
+
self.norms_1.append(LayerNorm(channels))
|
97 |
+
self.norms_2.append(LayerNorm(channels))
|
98 |
+
|
99 |
+
def forward(self, x, x_mask, g=None):
|
100 |
+
if g is not None:
|
101 |
+
x = x + g
|
102 |
+
for i in range(self.n_layers):
|
103 |
+
y = self.convs_sep[i](x * x_mask)
|
104 |
+
y = self.norms_1[i](y)
|
105 |
+
y = F.gelu(y)
|
106 |
+
y = self.convs_1x1[i](y)
|
107 |
+
y = self.norms_2[i](y)
|
108 |
+
y = F.gelu(y)
|
109 |
+
y = self.drop(y)
|
110 |
+
x = x + y
|
111 |
+
return x * x_mask
|
112 |
+
|
113 |
+
|
114 |
+
class WN(torch.nn.Module):
|
115 |
+
def __init__(self, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=0, spk_channels=0, p_dropout=0):
|
116 |
+
super(WN, self).__init__()
|
117 |
+
assert(kernel_size % 2 == 1)
|
118 |
+
self.hidden_channels =hidden_channels
|
119 |
+
self.kernel_size = kernel_size,
|
120 |
+
self.dilation_rate = dilation_rate
|
121 |
+
self.n_layers = n_layers
|
122 |
+
self.n_speakers = n_speakers
|
123 |
+
self.spk_channels = spk_channels
|
124 |
+
self.p_dropout = p_dropout
|
125 |
+
|
126 |
+
self.in_layers = torch.nn.ModuleList()
|
127 |
+
self.res_skip_layers = torch.nn.ModuleList()
|
128 |
+
self.drop = nn.Dropout(p_dropout)
|
129 |
+
|
130 |
+
if n_speakers > 0:
|
131 |
+
cond_layer = torch.nn.Conv1d(spk_channels, 2*hidden_channels*n_layers, 1)
|
132 |
+
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name='weight')
|
133 |
+
|
134 |
+
for i in range(n_layers):
|
135 |
+
dilation = dilation_rate ** i
|
136 |
+
padding = int((kernel_size * dilation - dilation) / 2)
|
137 |
+
in_layer = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, kernel_size,
|
138 |
+
dilation=dilation, padding=padding)
|
139 |
+
in_layer = torch.nn.utils.weight_norm(in_layer, name='weight')
|
140 |
+
self.in_layers.append(in_layer)
|
141 |
+
|
142 |
+
# last one is not necessary
|
143 |
+
if i < n_layers - 1:
|
144 |
+
res_skip_channels = 2 * hidden_channels
|
145 |
+
else:
|
146 |
+
res_skip_channels = hidden_channels
|
147 |
+
|
148 |
+
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
|
149 |
+
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name='weight')
|
150 |
+
self.res_skip_layers.append(res_skip_layer)
|
151 |
+
|
152 |
+
def forward(self, x, x_mask, g=None, **kwargs):
|
153 |
+
output = torch.zeros_like(x)
|
154 |
+
n_channels_tensor = torch.IntTensor([self.hidden_channels])
|
155 |
+
|
156 |
+
if g is not None:
|
157 |
+
g = self.cond_layer(g)
|
158 |
+
|
159 |
+
for i in range(self.n_layers):
|
160 |
+
x_in = self.in_layers[i](x)
|
161 |
+
if g is not None:
|
162 |
+
cond_offset = i * 2 * self.hidden_channels
|
163 |
+
g_l = g[:,cond_offset:cond_offset+2*self.hidden_channels,:]
|
164 |
+
else:
|
165 |
+
g_l = torch.zeros_like(x_in)
|
166 |
+
|
167 |
+
acts = commons.fused_add_tanh_sigmoid_multiply(
|
168 |
+
x_in,
|
169 |
+
g_l,
|
170 |
+
n_channels_tensor)
|
171 |
+
acts = self.drop(acts)
|
172 |
+
|
173 |
+
res_skip_acts = self.res_skip_layers[i](acts)
|
174 |
+
if i < self.n_layers - 1:
|
175 |
+
res_acts = res_skip_acts[:,:self.hidden_channels,:]
|
176 |
+
x = (x + res_acts) * x_mask
|
177 |
+
output = output + res_skip_acts[:,self.hidden_channels:,:]
|
178 |
+
else:
|
179 |
+
output = output + res_skip_acts
|
180 |
+
return output * x_mask
|
181 |
+
|
182 |
+
def remove_weight_norm(self):
|
183 |
+
if self.n_speakers > 0:
|
184 |
+
torch.nn.utils.remove_weight_norm(self.cond_layer)
|
185 |
+
for l in self.in_layers:
|
186 |
+
torch.nn.utils.remove_weight_norm(l)
|
187 |
+
for l in self.res_skip_layers:
|
188 |
+
torch.nn.utils.remove_weight_norm(l)
|
189 |
+
|
190 |
+
|
191 |
+
class ResBlock1(torch.nn.Module):
|
192 |
+
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
|
193 |
+
super(ResBlock1, self).__init__()
|
194 |
+
self.convs1 = nn.ModuleList([
|
195 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
196 |
+
padding=get_padding(kernel_size, dilation[0]))),
|
197 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
198 |
+
padding=get_padding(kernel_size, dilation[1]))),
|
199 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
|
200 |
+
padding=get_padding(kernel_size, dilation[2])))
|
201 |
+
])
|
202 |
+
self.convs1.apply(init_weights)
|
203 |
+
|
204 |
+
self.convs2 = nn.ModuleList([
|
205 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
206 |
+
padding=get_padding(kernel_size, 1))),
|
207 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
208 |
+
padding=get_padding(kernel_size, 1))),
|
209 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
|
210 |
+
padding=get_padding(kernel_size, 1)))
|
211 |
+
])
|
212 |
+
self.convs2.apply(init_weights)
|
213 |
+
|
214 |
+
def forward(self, x, x_mask=None):
|
215 |
+
for c1, c2 in zip(self.convs1, self.convs2):
|
216 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
217 |
+
if x_mask is not None:
|
218 |
+
xt = xt * x_mask
|
219 |
+
xt = c1(xt)
|
220 |
+
xt = F.leaky_relu(xt, LRELU_SLOPE)
|
221 |
+
if x_mask is not None:
|
222 |
+
xt = xt * x_mask
|
223 |
+
xt = c2(xt)
|
224 |
+
x = xt + x
|
225 |
+
if x_mask is not None:
|
226 |
+
x = x * x_mask
|
227 |
+
return x
|
228 |
+
|
229 |
+
def remove_weight_norm(self):
|
230 |
+
for l in self.convs1:
|
231 |
+
remove_weight_norm(l)
|
232 |
+
for l in self.convs2:
|
233 |
+
remove_weight_norm(l)
|
234 |
+
|
235 |
+
|
236 |
+
class ResBlock2(torch.nn.Module):
|
237 |
+
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
|
238 |
+
super(ResBlock2, self).__init__()
|
239 |
+
self.convs = nn.ModuleList([
|
240 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
|
241 |
+
padding=get_padding(kernel_size, dilation[0]))),
|
242 |
+
weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
|
243 |
+
padding=get_padding(kernel_size, dilation[1])))
|
244 |
+
])
|
245 |
+
self.convs.apply(init_weights)
|
246 |
+
|
247 |
+
def forward(self, x, x_mask=None):
|
248 |
+
for c in self.convs:
|
249 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
250 |
+
if x_mask is not None:
|
251 |
+
xt = xt * x_mask
|
252 |
+
xt = c(xt)
|
253 |
+
x = xt + x
|
254 |
+
if x_mask is not None:
|
255 |
+
x = x * x_mask
|
256 |
+
return x
|
257 |
+
|
258 |
+
def remove_weight_norm(self):
|
259 |
+
for l in self.convs:
|
260 |
+
remove_weight_norm(l)
|
261 |
+
|
262 |
+
|
263 |
+
class Log(nn.Module):
|
264 |
+
def forward(self, x, x_mask, reverse=False, **kwargs):
|
265 |
+
if not reverse:
|
266 |
+
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
|
267 |
+
logdet = torch.sum(-y, [1, 2])
|
268 |
+
return y, logdet
|
269 |
+
else:
|
270 |
+
x = torch.exp(x) * x_mask
|
271 |
+
return x
|
272 |
+
|
273 |
+
|
274 |
+
class Flip(nn.Module):
|
275 |
+
def forward(self, x, *args, reverse=False, **kwargs):
|
276 |
+
x = torch.flip(x, [1])
|
277 |
+
if not reverse:
|
278 |
+
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
|
279 |
+
return x, logdet
|
280 |
+
else:
|
281 |
+
return x
|
282 |
+
|
283 |
+
|
284 |
+
class ElementwiseAffine(nn.Module):
|
285 |
+
def __init__(self, channels):
|
286 |
+
super().__init__()
|
287 |
+
self.channels = channels
|
288 |
+
self.m = nn.Parameter(torch.zeros(channels,1))
|
289 |
+
self.logs = nn.Parameter(torch.zeros(channels,1))
|
290 |
+
|
291 |
+
def forward(self, x, x_mask, reverse=False, **kwargs):
|
292 |
+
if not reverse:
|
293 |
+
y = self.m + torch.exp(self.logs) * x
|
294 |
+
y = y * x_mask
|
295 |
+
logdet = torch.sum(self.logs * x_mask, [1,2])
|
296 |
+
return y, logdet
|
297 |
+
else:
|
298 |
+
x = (x - self.m) * torch.exp(-self.logs) * x_mask
|
299 |
+
return x
|
300 |
+
|
301 |
+
|
302 |
+
class ResidualCouplingLayer(nn.Module):
|
303 |
+
def __init__(self,
|
304 |
+
channels,
|
305 |
+
hidden_channels,
|
306 |
+
kernel_size,
|
307 |
+
dilation_rate,
|
308 |
+
n_layers,
|
309 |
+
p_dropout=0,
|
310 |
+
n_speakers=0,
|
311 |
+
spk_channels=0,
|
312 |
+
mean_only=False):
|
313 |
+
assert channels % 2 == 0, "channels should be divisible by 2"
|
314 |
+
super().__init__()
|
315 |
+
self.channels = channels
|
316 |
+
self.hidden_channels = hidden_channels
|
317 |
+
self.kernel_size = kernel_size
|
318 |
+
self.dilation_rate = dilation_rate
|
319 |
+
self.n_layers = n_layers
|
320 |
+
self.half_channels = channels // 2
|
321 |
+
self.mean_only = mean_only
|
322 |
+
|
323 |
+
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
|
324 |
+
self.enc = WN(hidden_channels, kernel_size, dilation_rate, n_layers, p_dropout=p_dropout, n_speakers=n_speakers, spk_channels=spk_channels)
|
325 |
+
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
|
326 |
+
self.post.weight.data.zero_()
|
327 |
+
self.post.bias.data.zero_()
|
328 |
+
|
329 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
330 |
+
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
|
331 |
+
h = self.pre(x0) * x_mask
|
332 |
+
h = self.enc(h, x_mask, g=g)
|
333 |
+
stats = self.post(h) * x_mask
|
334 |
+
if not self.mean_only:
|
335 |
+
m, logs = torch.split(stats, [self.half_channels]*2, 1)
|
336 |
+
else:
|
337 |
+
m = stats
|
338 |
+
logs = torch.zeros_like(m)
|
339 |
+
|
340 |
+
if not reverse:
|
341 |
+
x1 = m + x1 * torch.exp(logs) * x_mask
|
342 |
+
x = torch.cat([x0, x1], 1)
|
343 |
+
logdet = torch.sum(logs, [1,2])
|
344 |
+
return x, logdet
|
345 |
+
else:
|
346 |
+
x1 = (x1 - m) * torch.exp(-logs) * x_mask
|
347 |
+
x = torch.cat([x0, x1], 1)
|
348 |
+
return x
|
349 |
+
|
350 |
+
class ResidualCouplingBlock(nn.Module):
|
351 |
+
def __init__(self,
|
352 |
+
channels,
|
353 |
+
hidden_channels,
|
354 |
+
kernel_size,
|
355 |
+
dilation_rate,
|
356 |
+
n_layers,
|
357 |
+
n_flows=4,
|
358 |
+
n_speakers=0,
|
359 |
+
gin_channels=0):
|
360 |
+
super().__init__()
|
361 |
+
self.channels = channels
|
362 |
+
self.hidden_channels = hidden_channels
|
363 |
+
self.kernel_size = kernel_size
|
364 |
+
self.dilation_rate = dilation_rate
|
365 |
+
self.n_layers = n_layers
|
366 |
+
self.n_flows = n_flows
|
367 |
+
self.gin_channels = gin_channels
|
368 |
+
|
369 |
+
self.flows = nn.ModuleList()
|
370 |
+
for i in range(n_flows):
|
371 |
+
self.flows.append(ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, n_speakers=n_speakers, spk_channels=gin_channels, mean_only=True))
|
372 |
+
self.flows.append(Flip())
|
373 |
+
|
374 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
375 |
+
if not reverse:
|
376 |
+
for flow in self.flows:
|
377 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
378 |
+
else:
|
379 |
+
for flow in reversed(self.flows):
|
380 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
381 |
+
return x
|
382 |
+
|
383 |
+
|
384 |
+
class ConvFlow(nn.Module):
|
385 |
+
def __init__(self, in_channels, filter_channels, kernel_size, n_layers, num_bins=10, tail_bound=5.0):
|
386 |
+
super().__init__()
|
387 |
+
self.in_channels = in_channels
|
388 |
+
self.filter_channels = filter_channels
|
389 |
+
self.kernel_size = kernel_size
|
390 |
+
self.n_layers = n_layers
|
391 |
+
self.num_bins = num_bins
|
392 |
+
self.tail_bound = tail_bound
|
393 |
+
self.half_channels = in_channels // 2
|
394 |
+
|
395 |
+
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
|
396 |
+
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.)
|
397 |
+
self.proj = nn.Conv1d(filter_channels, self.half_channels * (num_bins * 3 - 1), 1)
|
398 |
+
self.proj.weight.data.zero_()
|
399 |
+
self.proj.bias.data.zero_()
|
400 |
+
|
401 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
402 |
+
x0, x1 = torch.split(x, [self.half_channels]*2, 1)
|
403 |
+
h = self.pre(x0)
|
404 |
+
h = self.convs(h, x_mask, g=g)
|
405 |
+
h = self.proj(h) * x_mask
|
406 |
+
|
407 |
+
b, c, t = x0.shape
|
408 |
+
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
|
409 |
+
|
410 |
+
unnormalized_widths = h[..., :self.num_bins] / math.sqrt(self.filter_channels)
|
411 |
+
unnormalized_heights = h[..., self.num_bins:2*self.num_bins] / math.sqrt(self.filter_channels)
|
412 |
+
unnormalized_derivatives = h[..., 2 * self.num_bins:]
|
413 |
+
|
414 |
+
x1, logabsdet = piecewise_rational_quadratic_transform(x1,
|
415 |
+
unnormalized_widths,
|
416 |
+
unnormalized_heights,
|
417 |
+
unnormalized_derivatives,
|
418 |
+
inverse=reverse,
|
419 |
+
tails='linear',
|
420 |
+
tail_bound=self.tail_bound
|
421 |
+
)
|
422 |
+
|
423 |
+
x = torch.cat([x0, x1], 1) * x_mask
|
424 |
+
logdet = torch.sum(logabsdet * x_mask, [1,2])
|
425 |
+
if not reverse:
|
426 |
+
return x, logdet
|
427 |
+
else:
|
428 |
+
return x
|
429 |
+
|
430 |
+
|
431 |
+
class ResStack(nn.Module):
|
432 |
+
def __init__(self, channel, kernel_size=3, base=3, nums=4):
|
433 |
+
super(ResStack, self).__init__()
|
434 |
+
|
435 |
+
self.layers = nn.ModuleList([
|
436 |
+
nn.Sequential(
|
437 |
+
nn.LeakyReLU(),
|
438 |
+
nn.utils.weight_norm(nn.Conv1d(channel, channel,
|
439 |
+
kernel_size=kernel_size, dilation=base**i, padding=base**i)),
|
440 |
+
nn.LeakyReLU(),
|
441 |
+
nn.utils.weight_norm(nn.Conv1d(channel, channel,
|
442 |
+
kernel_size=kernel_size, dilation=1, padding=1)),
|
443 |
+
)
|
444 |
+
for i in range(nums)
|
445 |
+
])
|
446 |
+
|
447 |
+
def forward(self, x):
|
448 |
+
for layer in self.layers:
|
449 |
+
x = x + layer(x)
|
450 |
+
return x
|
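A minimal round-trip sketch for the ResidualCouplingBlock flow defined above; the channel sizes and sequence length are illustrative, and the reverse pass should reproduce the input up to numerical error:

import torch
from modules.modules import ResidualCouplingBlock

flow = ResidualCouplingBlock(channels=192, hidden_channels=192, kernel_size=5,
                             dilation_rate=1, n_layers=4)
z = torch.randn(2, 192, 40)
x_mask = torch.ones(2, 1, 40)
z_p = flow(z, x_mask)                      # forward (training) direction
z_rec = flow(z_p, x_mask, reverse=True)    # inverse direction recovers z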
modules/stft.py
ADDED
@@ -0,0 +1,512 @@
1 |
+
from librosa.util import pad_center, tiny
|
2 |
+
from scipy.signal import get_window
|
3 |
+
from torch import Tensor
|
4 |
+
from torch.autograd import Variable
|
5 |
+
from typing import Optional, Tuple
|
6 |
+
|
7 |
+
import librosa
|
8 |
+
import librosa.util as librosa_util
|
9 |
+
import math
|
10 |
+
import numpy as np
|
11 |
+
import scipy
|
12 |
+
import torch
|
13 |
+
import torch.nn.functional as F
|
14 |
+
import warnings
|
15 |
+
|
16 |
+
|
17 |
+
def create_fb_matrix(
|
18 |
+
n_freqs: int,
|
19 |
+
f_min: float,
|
20 |
+
f_max: float,
|
21 |
+
n_mels: int,
|
22 |
+
sample_rate: int,
|
23 |
+
norm: Optional[str] = None
|
24 |
+
) -> Tensor:
|
25 |
+
r"""Create a frequency bin conversion matrix.
|
26 |
+
|
27 |
+
Args:
|
28 |
+
n_freqs (int): Number of frequencies to highlight/apply
|
29 |
+
f_min (float): Minimum frequency (Hz)
|
30 |
+
f_max (float): Maximum frequency (Hz)
|
31 |
+
n_mels (int): Number of mel filterbanks
|
32 |
+
sample_rate (int): Sample rate of the audio waveform
|
33 |
+
norm (Optional[str]): If 'slaney', divide the triangular mel weights by the width of the mel band
|
34 |
+
(area normalization). (Default: ``None``)
|
35 |
+
|
36 |
+
Returns:
|
37 |
+
Tensor: Triangular filter banks (fb matrix) of size (``n_freqs``, ``n_mels``)
|
38 |
+
meaning number of frequencies to highlight/apply to x the number of filterbanks.
|
39 |
+
Each column is a filterbank so that assuming there is a matrix A of
|
40 |
+
size (..., ``n_freqs``), the applied result would be
|
41 |
+
``A * create_fb_matrix(A.size(-1), ...)``.
|
42 |
+
"""
|
43 |
+
|
44 |
+
if norm is not None and norm != "slaney":
|
45 |
+
raise ValueError("norm must be one of None or 'slaney'")
|
46 |
+
|
47 |
+
# freq bins
|
48 |
+
# Equivalent filterbank construction by Librosa
|
49 |
+
all_freqs = torch.linspace(0, sample_rate // 2, n_freqs)
|
50 |
+
|
51 |
+
# calculate mel freq bins
|
52 |
+
# hertz to mel(f) is 2595. * math.log10(1. + (f / 700.))
|
53 |
+
m_min = 2595.0 * math.log10(1.0 + (f_min / 700.0))
|
54 |
+
m_max = 2595.0 * math.log10(1.0 + (f_max / 700.0))
|
55 |
+
m_pts = torch.linspace(m_min, m_max, n_mels + 2)
|
56 |
+
# mel to hertz(mel) is 700. * (10**(mel / 2595.) - 1.)
|
57 |
+
f_pts = 700.0 * (10 ** (m_pts / 2595.0) - 1.0)
|
58 |
+
# calculate the difference between each mel point and each stft freq point in hertz
|
59 |
+
f_diff = f_pts[1:] - f_pts[:-1] # (n_mels + 1)
|
60 |
+
slopes = f_pts.unsqueeze(0) - all_freqs.unsqueeze(1) # (n_freqs, n_mels + 2)
|
61 |
+
# create overlapping triangles
|
62 |
+
down_slopes = (-1.0 * slopes[:, :-2]) / f_diff[:-1] # (n_freqs, n_mels)
|
63 |
+
up_slopes = slopes[:, 2:] / f_diff[1:] # (n_freqs, n_mels)
|
64 |
+
fb = torch.min(down_slopes, up_slopes)
|
65 |
+
fb = torch.clamp(fb, 1e-6, 1)
|
66 |
+
|
67 |
+
if norm is not None and norm == "slaney":
|
68 |
+
# Slaney-style mel is scaled to be approx constant energy per channel
|
69 |
+
enorm = 2.0 / (f_pts[2:n_mels + 2] - f_pts[:n_mels])
|
70 |
+
fb *= enorm.unsqueeze(0)
|
71 |
+
return fb
|
72 |
+
|
73 |
+
|
74 |
+
def lfilter(
|
75 |
+
waveform: Tensor,
|
76 |
+
a_coeffs: Tensor,
|
77 |
+
b_coeffs: Tensor,
|
78 |
+
clamp: bool = True,
|
79 |
+
) -> Tensor:
|
80 |
+
r"""Perform an IIR filter by evaluating difference equation.
|
81 |
+
|
82 |
+
Args:
|
83 |
+
waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1.
|
84 |
+
a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(n_order + 1)``.
|
85 |
+
Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``.
|
86 |
+
Must be same size as b_coeffs (pad with 0's as necessary).
|
87 |
+
b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(n_order + 1)``.
|
88 |
+
Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``.
|
89 |
+
Must be same size as a_coeffs (pad with 0's as necessary).
|
90 |
+
clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``)
|
91 |
+
|
92 |
+
Returns:
|
93 |
+
Tensor: Waveform with dimension of ``(..., time)``.
|
94 |
+
"""
|
95 |
+
# pack batch
|
96 |
+
shape = waveform.size()
|
97 |
+
waveform = waveform.reshape(-1, shape[-1])
|
98 |
+
|
99 |
+
assert (a_coeffs.size(0) == b_coeffs.size(0))
|
100 |
+
assert (len(waveform.size()) == 2)
|
101 |
+
assert (waveform.device == a_coeffs.device)
|
102 |
+
assert (b_coeffs.device == a_coeffs.device)
|
103 |
+
|
104 |
+
device = waveform.device
|
105 |
+
dtype = waveform.dtype
|
106 |
+
n_channel, n_sample = waveform.size()
|
107 |
+
n_order = a_coeffs.size(0)
|
108 |
+
n_sample_padded = n_sample + n_order - 1
|
109 |
+
assert (n_order > 0)
|
110 |
+
|
111 |
+
# Pad the input and create output
|
112 |
+
padded_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device)
|
113 |
+
padded_waveform[:, (n_order - 1):] = waveform
|
114 |
+
padded_output_waveform = torch.zeros(n_channel, n_sample_padded, dtype=dtype, device=device)
|
115 |
+
|
116 |
+
# Set up the coefficients matrix
|
117 |
+
# Flip coefficients' order
|
118 |
+
a_coeffs_flipped = a_coeffs.flip(0)
|
119 |
+
b_coeffs_flipped = b_coeffs.flip(0)
|
120 |
+
|
121 |
+
# calculate windowed_input_signal in parallel
|
122 |
+
# create indices of original with shape (n_channel, n_order, n_sample)
|
123 |
+
window_idxs = torch.arange(n_sample, device=device).unsqueeze(0) + torch.arange(n_order, device=device).unsqueeze(1)
|
124 |
+
window_idxs = window_idxs.repeat(n_channel, 1, 1)
|
125 |
+
window_idxs += (torch.arange(n_channel, device=device).unsqueeze(-1).unsqueeze(-1) * n_sample_padded)
|
126 |
+
window_idxs = window_idxs.long()
|
127 |
+
# (n_order, ) matmul (n_channel, n_order, n_sample) -> (n_channel, n_sample)
|
128 |
+
input_signal_windows = torch.matmul(b_coeffs_flipped, torch.take(padded_waveform, window_idxs))
|
129 |
+
|
130 |
+
input_signal_windows.div_(a_coeffs[0])
|
131 |
+
a_coeffs_flipped.div_(a_coeffs[0])
|
132 |
+
for i_sample, o0 in enumerate(input_signal_windows.t()):
|
133 |
+
windowed_output_signal = padded_output_waveform[:, i_sample:(i_sample + n_order)]
|
134 |
+
o0.addmv_(windowed_output_signal, a_coeffs_flipped, alpha=-1)
|
135 |
+
padded_output_waveform[:, i_sample + n_order - 1] = o0
|
136 |
+
|
137 |
+
output = padded_output_waveform[:, (n_order - 1):]
|
138 |
+
|
139 |
+
if clamp:
|
140 |
+
output = torch.clamp(output, min=-1., max=1.)
|
141 |
+
|
142 |
+
# unpack batch
|
143 |
+
output = output.reshape(shape[:-1] + output.shape[-1:])
|
144 |
+
|
145 |
+
return output
|
146 |
+
|
147 |
+
|
148 |
+
|
149 |
+
def biquad(
|
150 |
+
waveform: Tensor,
|
151 |
+
b0: float,
|
152 |
+
b1: float,
|
153 |
+
b2: float,
|
154 |
+
a0: float,
|
155 |
+
a1: float,
|
156 |
+
a2: float
|
157 |
+
) -> Tensor:
|
158 |
+
r"""Perform a biquad filter of input tensor. Initial conditions set to 0.
|
159 |
+
https://en.wikipedia.org/wiki/Digital_biquad_filter
|
160 |
+
|
161 |
+
Args:
|
162 |
+
waveform (Tensor): audio waveform of dimension of `(..., time)`
|
163 |
+
b0 (float): numerator coefficient of current input, x[n]
|
164 |
+
b1 (float): numerator coefficient of input one time step ago x[n-1]
|
165 |
+
b2 (float): numerator coefficient of input two time steps ago x[n-2]
|
166 |
+
a0 (float): denominator coefficient of current output y[n], typically 1
|
167 |
+
a1 (float): denominator coefficient of current output y[n-1]
|
168 |
+
a2 (float): denominator coefficient of current output y[n-2]
|
169 |
+
|
170 |
+
Returns:
|
171 |
+
Tensor: Waveform with dimension of `(..., time)`
|
172 |
+
"""
|
173 |
+
|
174 |
+
device = waveform.device
|
175 |
+
dtype = waveform.dtype
|
176 |
+
|
177 |
+
output_waveform = lfilter(
|
178 |
+
waveform,
|
179 |
+
torch.tensor([a0, a1, a2], dtype=dtype, device=device),
|
180 |
+
torch.tensor([b0, b1, b2], dtype=dtype, device=device)
|
181 |
+
)
|
182 |
+
return output_waveform
|
183 |
+
|
184 |
+
|
185 |
+
|
186 |
+
def _dB2Linear(x: float) -> float:
|
187 |
+
return math.exp(x * math.log(10) / 20.0)
|
188 |
+
|
189 |
+
|
190 |
+
def highpass_biquad(
|
191 |
+
waveform: Tensor,
|
192 |
+
sample_rate: int,
|
193 |
+
cutoff_freq: float,
|
194 |
+
Q: float = 0.707
|
195 |
+
) -> Tensor:
|
196 |
+
r"""Design biquad highpass filter and perform filtering. Similar to SoX implementation.
|
197 |
+
|
198 |
+
Args:
|
199 |
+
waveform (Tensor): audio waveform of dimension of `(..., time)`
|
200 |
+
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
|
201 |
+
cutoff_freq (float): filter cutoff frequency
|
202 |
+
Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
|
203 |
+
|
204 |
+
Returns:
|
205 |
+
Tensor: Waveform dimension of `(..., time)`
|
206 |
+
"""
|
207 |
+
w0 = 2 * math.pi * cutoff_freq / sample_rate
|
208 |
+
alpha = math.sin(w0) / 2. / Q
|
209 |
+
|
210 |
+
b0 = (1 + math.cos(w0)) / 2
|
211 |
+
b1 = -1 - math.cos(w0)
|
212 |
+
b2 = b0
|
213 |
+
a0 = 1 + alpha
|
214 |
+
a1 = -2 * math.cos(w0)
|
215 |
+
a2 = 1 - alpha
|
216 |
+
return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
217 |
+
|
218 |
+
|
219 |
+
|
220 |
+
def lowpass_biquad(
|
221 |
+
waveform: Tensor,
|
222 |
+
sample_rate: int,
|
223 |
+
cutoff_freq: float,
|
224 |
+
Q: float = 0.707
|
225 |
+
) -> Tensor:
|
226 |
+
r"""Design biquad lowpass filter and perform filtering. Similar to SoX implementation.
|
227 |
+
|
228 |
+
Args:
|
229 |
+
waveform (torch.Tensor): audio waveform of dimension of `(..., time)`
|
230 |
+
sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
|
231 |
+
cutoff_freq (float): filter cutoff frequency
|
232 |
+
Q (float, optional): https://en.wikipedia.org/wiki/Q_factor (Default: ``0.707``)
|
233 |
+
|
234 |
+
Returns:
|
235 |
+
Tensor: Waveform of dimension of `(..., time)`
|
236 |
+
"""
|
237 |
+
w0 = 2 * math.pi * cutoff_freq / sample_rate
|
238 |
+
alpha = math.sin(w0) / 2 / Q
|
239 |
+
|
240 |
+
b0 = (1 - math.cos(w0)) / 2
|
241 |
+
b1 = 1 - math.cos(w0)
|
242 |
+
b2 = b0
|
243 |
+
a0 = 1 + alpha
|
244 |
+
a1 = -2 * math.cos(w0)
|
245 |
+
a2 = 1 - alpha
|
246 |
+
return biquad(waveform, b0, b1, b2, a0, a1, a2)
|
247 |
+
|
248 |
+
|
249 |
+
def window_sumsquare(window, n_frames, hop_length=200, win_length=800,
|
250 |
+
n_fft=800, dtype=np.float32, norm=None):
|
251 |
+
"""
|
252 |
+
# from librosa 0.6
|
253 |
+
Compute the sum-square envelope of a window function at a given hop length.
|
254 |
+
|
255 |
+
This is used to estimate modulation effects induced by windowing
|
256 |
+
observations in short-time fourier transforms.
|
257 |
+
|
258 |
+
Parameters
|
259 |
+
----------
|
260 |
+
window : string, tuple, number, callable, or list-like
|
261 |
+
Window specification, as in `get_window`
|
262 |
+
|
263 |
+
n_frames : int > 0
|
264 |
+
The number of analysis frames
|
265 |
+
|
266 |
+
hop_length : int > 0
|
267 |
+
The number of samples to advance between frames
|
268 |
+
|
269 |
+
win_length : [optional]
|
270 |
+
The length of the window function. By default, this matches `n_fft`.
|
271 |
+
|
272 |
+
n_fft : int > 0
|
273 |
+
The length of each analysis frame.
|
274 |
+
|
275 |
+
dtype : np.dtype
|
276 |
+
The data type of the output
|
277 |
+
|
278 |
+
Returns
|
279 |
+
-------
|
280 |
+
wss : np.ndarray, shape=`(n_fft + hop_length * (n_frames - 1))`
|
281 |
+
The sum-squared envelope of the window function
|
282 |
+
"""
|
283 |
+
if win_length is None:
|
284 |
+
win_length = n_fft
|
285 |
+
|
286 |
+
n = n_fft + hop_length * (n_frames - 1)
|
287 |
+
x = np.zeros(n, dtype=dtype)
|
288 |
+
|
289 |
+
# Compute the squared window at the desired length
|
290 |
+
win_sq = get_window(window, win_length, fftbins=True)
|
291 |
+
win_sq = librosa_util.normalize(win_sq, norm=norm)**2
|
292 |
+
win_sq = librosa_util.pad_center(win_sq, n_fft)
|
293 |
+
|
294 |
+
# Fill the envelope
|
295 |
+
for i in range(n_frames):
|
296 |
+
sample = i * hop_length
|
297 |
+
x[sample:min(n, sample + n_fft)] += win_sq[:max(0, min(n_fft, n - sample))]
|
298 |
+
return x
|
299 |
+
|
300 |
+
|
301 |
+
class MelScale(torch.nn.Module):
|
302 |
+
r"""Turn a normal STFT into a mel frequency STFT, using a conversion
|
303 |
+
matrix. This uses triangular filter banks.
|
304 |
+
|
305 |
+
User can control which device the filter bank (`fb`) is (e.g. fb.to(spec_f.device)).
|
306 |
+
|
307 |
+
Args:
|
308 |
+
n_mels (int, optional): Number of mel filterbanks. (Default: ``128``)
|
309 |
+
sample_rate (int, optional): Sample rate of audio signal. (Default: ``16000``)
|
310 |
+
f_min (float, optional): Minimum frequency. (Default: ``0.``)
|
311 |
+
f_max (float or None, optional): Maximum frequency. (Default: ``sample_rate // 2``)
|
312 |
+
n_stft (int, optional): Number of bins in STFT. Calculated from first input
|
313 |
+
if None is given. See ``n_fft`` in :class:`Spectrogram`. (Default: ``None``)
|
314 |
+
"""
|
315 |
+
__constants__ = ['n_mels', 'sample_rate', 'f_min', 'f_max']
|
316 |
+
|
317 |
+
def __init__(self,
|
318 |
+
n_mels: int = 128,
|
319 |
+
sample_rate: int = 24000,
|
320 |
+
f_min: float = 0.,
|
321 |
+
f_max: Optional[float] = None,
|
322 |
+
n_stft: Optional[int] = None) -> None:
|
323 |
+
super(MelScale, self).__init__()
|
324 |
+
self.n_mels = n_mels
|
325 |
+
self.sample_rate = sample_rate
|
326 |
+
self.f_max = f_max if f_max is not None else float(sample_rate // 2)
|
327 |
+
self.f_min = f_min
|
328 |
+
|
329 |
+
assert f_min <= self.f_max, 'Require f_min: %f < f_max: %f' % (f_min, self.f_max)
|
330 |
+
|
331 |
+
fb = torch.empty(0) if n_stft is None else create_fb_matrix(
|
332 |
+
n_stft, self.f_min, self.f_max, self.n_mels, self.sample_rate)
|
333 |
+
self.register_buffer('fb', fb)
|
334 |
+
|
335 |
+
def forward(self, specgram: Tensor) -> Tensor:
|
336 |
+
r"""
|
337 |
+
Args:
|
338 |
+
specgram (Tensor): A spectrogram STFT of dimension (..., freq, time).
|
339 |
+
|
340 |
+
Returns:
|
341 |
+
Tensor: Mel frequency spectrogram of size (..., ``n_mels``, time).
|
342 |
+
"""
|
343 |
+
|
344 |
+
# pack batch
|
345 |
+
shape = specgram.size()
|
346 |
+
        specgram = specgram.reshape(-1, shape[-2], shape[-1])

        if self.fb.numel() == 0:
            tmp_fb = create_fb_matrix(specgram.size(1), self.f_min, self.f_max, self.n_mels, self.sample_rate)
            # Attributes cannot be reassigned outside __init__ so workaround
            self.fb.resize_(tmp_fb.size())
            self.fb.copy_(tmp_fb)

        # (channel, frequency, time).transpose(...) dot (frequency, n_mels)
        # -> (channel, time, n_mels).transpose(...)
        mel_specgram = torch.matmul(specgram.transpose(1, 2), self.fb).transpose(1, 2)

        # unpack batch
        mel_specgram = mel_specgram.reshape(shape[:-2] + mel_specgram.shape[-2:])

        return mel_specgram


class TorchSTFT(torch.nn.Module):
    def __init__(self, fft_size, hop_size, win_size,
                 normalized=False, domain='linear',
                 mel_scale=False, ref_level_db=20, min_level_db=-100):
        super().__init__()
        self.fft_size = fft_size
        self.hop_size = hop_size
        self.win_size = win_size
        self.ref_level_db = ref_level_db
        self.min_level_db = min_level_db
        self.window = torch.hann_window(win_size)
        self.normalized = normalized
        self.domain = domain
        self.mel_scale = MelScale(n_mels=(fft_size // 2 + 1),
                                  n_stft=(fft_size // 2 + 1)) if mel_scale else None

    def transform(self, x):
        x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size,
                            self.window.type_as(x), normalized=self.normalized)
        real = x_stft[..., 0]
        imag = x_stft[..., 1]
        mag = torch.clamp(real ** 2 + imag ** 2, min=1e-7)
        mag = torch.sqrt(mag)
        phase = torch.atan2(imag, real)

        if self.mel_scale is not None:
            mag = self.mel_scale(mag)

        if self.domain == 'log':
            mag = 20 * torch.log10(mag) - self.ref_level_db
            mag = torch.clamp((mag - self.min_level_db) / -self.min_level_db, 0, 1)
            return mag, phase
        elif self.domain == 'linear':
            return mag, phase
        elif self.domain == 'double':
            log_mag = 20 * torch.log10(mag) - self.ref_level_db
            log_mag = torch.clamp((log_mag - self.min_level_db) / -self.min_level_db, 0, 1)
            return torch.cat((mag, log_mag), dim=1), phase

    def complex(self, x):
        x_stft = torch.stft(x, self.fft_size, self.hop_size, self.win_size,
                            self.window.type_as(x), normalized=self.normalized)
        real = x_stft[..., 0]
        imag = x_stft[..., 1]
        return real, imag


class STFT(torch.nn.Module):
    """adapted from Prem Seetharaman's https://github.com/pseeth/pytorch-stft"""
    def __init__(self, filter_length=800, hop_length=200, win_length=800,
                 window='hann'):
        super(STFT, self).__init__()
        self.filter_length = filter_length
        self.hop_length = hop_length
        self.win_length = win_length
        self.window = window
        self.forward_transform = None
        scale = self.filter_length / self.hop_length
        fourier_basis = np.fft.fft(np.eye(self.filter_length))

        cutoff = int((self.filter_length / 2 + 1))
        fourier_basis = np.vstack([np.real(fourier_basis[:cutoff, :]),
                                   np.imag(fourier_basis[:cutoff, :])])

        forward_basis = torch.FloatTensor(fourier_basis[:, None, :])
        inverse_basis = torch.FloatTensor(
            np.linalg.pinv(scale * fourier_basis).T[:, None, :])

        if window is not None:
            assert(filter_length >= win_length)
            # get window and zero center pad it to filter_length
            fft_window = get_window(window, win_length, fftbins=True)
            fft_window = pad_center(fft_window, filter_length)
            fft_window = torch.from_numpy(fft_window).float()

            # window the bases
            forward_basis *= fft_window
            inverse_basis *= fft_window

        self.register_buffer('forward_basis', forward_basis.float())
        self.register_buffer('inverse_basis', inverse_basis.float())

    def transform(self, input_data):
        num_batches = input_data.size(0)
        num_samples = input_data.size(1)

        self.num_samples = num_samples

        # similar to librosa, reflect-pad the input
        input_data = input_data.view(num_batches, 1, num_samples)
        input_data = F.pad(
            input_data.unsqueeze(1),
            (int(self.filter_length / 2), int(self.filter_length / 2), 0, 0),
            mode='reflect')
        input_data = input_data.squeeze(1)

        forward_transform = F.conv1d(
            input_data,
            Variable(self.forward_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        cutoff = int((self.filter_length / 2) + 1)
        real_part = forward_transform[:, :cutoff, :]
        imag_part = forward_transform[:, cutoff:, :]

        magnitude = torch.sqrt(real_part**2 + imag_part**2)
        phase = torch.autograd.Variable(
            torch.atan2(imag_part.data, real_part.data))

        return magnitude, phase

    def inverse(self, magnitude, phase):
        recombine_magnitude_phase = torch.cat(
            [magnitude*torch.cos(phase), magnitude*torch.sin(phase)], dim=1)

        inverse_transform = F.conv_transpose1d(
            recombine_magnitude_phase,
            Variable(self.inverse_basis, requires_grad=False),
            stride=self.hop_length,
            padding=0)

        if self.window is not None:
            window_sum = window_sumsquare(
                self.window, magnitude.size(-1), hop_length=self.hop_length,
                win_length=self.win_length, n_fft=self.filter_length,
                dtype=np.float32)
            # remove modulation effects
            approx_nonzero_indices = torch.from_numpy(
                np.where(window_sum > tiny(window_sum))[0])
            window_sum = torch.autograd.Variable(
                torch.from_numpy(window_sum), requires_grad=False)
            window_sum = window_sum.cuda() if magnitude.is_cuda else window_sum
            inverse_transform[:, :, approx_nonzero_indices] /= window_sum[approx_nonzero_indices]

            # scale by hop ratio
            inverse_transform *= float(self.filter_length) / self.hop_length

        inverse_transform = inverse_transform[:, :, int(self.filter_length/2):]
        inverse_transform = inverse_transform[:, :, :-int(self.filter_length/2)]

        return inverse_transform

    def forward(self, input_data):
        self.magnitude, self.phase = self.transform(input_data)
        reconstruction = self.inverse(self.magnitude, self.phase)
        return reconstruction
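
For orientation, a minimal usage sketch of the TorchSTFT module above; the fft/hop/win sizes and the one-second random waveform are illustrative stand-ins, not values taken from the repository config:

import torch
from modules.stft import TorchSTFT

stft = TorchSTFT(fft_size=2048, hop_size=512, win_size=2048, domain='linear')  # illustrative sizes
wav = torch.randn(1, 44100)          # placeholder mono audio, shape (batch, samples)
mag, phase = stft.transform(wav)     # magnitude and phase spectrograms
real, imag = stft.complex(wav)       # raw real / imaginary STFT components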
modules/transforms.py
ADDED
@@ -0,0 +1,193 @@
import torch
from torch.nn import functional as F

import numpy as np


DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3


def piecewise_rational_quadratic_transform(inputs,
                                            unnormalized_widths,
                                            unnormalized_heights,
                                            unnormalized_derivatives,
                                            inverse=False,
                                            tails=None,
                                            tail_bound=1.,
                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):

    if tails is None:
        spline_fn = rational_quadratic_spline
        spline_kwargs = {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
        spline_kwargs = {
            'tails': tails,
            'tail_bound': tail_bound
        }

    outputs, logabsdet = spline_fn(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **spline_kwargs
    )
    return outputs, logabsdet


def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations[..., -1] += eps
    return torch.sum(
        inputs[..., None] >= bin_locations,
        dim=-1
    ) - 1


def unconstrained_rational_quadratic_spline(inputs,
                                            unnormalized_widths,
                                            unnormalized_heights,
                                            unnormalized_derivatives,
                                            inverse=False,
                                            tails='linear',
                                            tail_bound=1.,
                                            min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                                            min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                                            min_derivative=DEFAULT_MIN_DERIVATIVE):
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == 'linear':
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError('{} tails are not implemented.'.format(tails))

    outputs[inside_interval_mask], logabsdet[inside_interval_mask] = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound, right=tail_bound, bottom=-tail_bound, top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative
    )

    return outputs, logabsdet


def rational_quadratic_spline(inputs,
                              unnormalized_widths,
                              unnormalized_heights,
                              unnormalized_derivatives,
                              inverse=False,
                              left=0., right=1., bottom=0., top=1.,
                              min_bin_width=DEFAULT_MIN_BIN_WIDTH,
                              min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
                              min_derivative=DEFAULT_MIN_DERIVATIVE):
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError('Input to a transform is not within its domain')

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError('Minimal bin width too large for the number of bins')
    if min_bin_height * num_bins > 1.0:
        raise ValueError('Minimal bin height too large for the number of bins')

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode='constant', value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode='constant', value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (((inputs - input_cumheights) * (input_derivatives
                                             + input_derivatives_plus_one
                                             - 2 * input_delta)
              + input_heights * (input_delta - input_derivatives)))
        b = (input_heights * input_derivatives
             - (inputs - input_cumheights) * (input_derivatives
                                              + input_derivatives_plus_one
                                              - 2 * input_delta))
        c = - input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * root.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - root).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (input_delta * theta.pow(2)
                                     + input_derivatives * theta_one_minus_theta)
        denominator = input_delta + ((input_derivatives + input_derivatives_plus_one - 2 * input_delta)
                                     * theta_one_minus_theta)
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (input_derivatives_plus_one * theta.pow(2)
                                                     + 2 * input_delta * theta_one_minus_theta
                                                     + input_derivatives * (1 - theta).pow(2))
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
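
A short sketch of driving the spline transform above with hand-made tensors; the shapes and the choice of 10 bins are illustrative (in practice the unnormalized parameters are produced by a small network inside the flow modules):

import torch
from modules.transforms import piecewise_rational_quadratic_transform

num_bins = 10
x = torch.linspace(-1.5, 1.5, 6).reshape(2, 3)   # values outside [-1, 1] pass through the linear tails unchanged
w = torch.randn(2, 3, num_bins)                  # unnormalized bin widths
h = torch.randn(2, 3, num_bins)                  # unnormalized bin heights
d = torch.randn(2, 3, num_bins - 1)              # unnormalized knot derivatives (padded internally when tails='linear')
y, logabsdet = piecewise_rational_quadratic_transform(x, w, h, d, tails='linear', tail_bound=1.0)
x_back, _ = piecewise_rational_quadratic_transform(y, w, h, d, inverse=True, tails='linear', tail_bound=1.0)  # recovers x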
preprocess/mel_processing.py
ADDED
@@ -0,0 +1,104 @@
import math
import os
import random
import torch
from torch import nn
import torch.nn.functional as F
import torch.utils.data
import numpy as np
import librosa
import librosa.util as librosa_util
from librosa.util import normalize, pad_center, tiny
from scipy.signal import get_window
from scipy.io.wavfile import read
from librosa.filters import mel as librosa_mel_fn

MAX_WAV_VALUE = 32768.0


def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
    """
    PARAMS
    ------
    C: compression factor
    """
    return torch.log(torch.clamp(x, min=clip_val) * C)


def dynamic_range_decompression_torch(x, C=1):
    """
    PARAMS
    ------
    C: compression factor used to compress
    """
    return torch.exp(x) / C


def spectral_normalize_torch(magnitudes):
    output = dynamic_range_compression_torch(magnitudes)
    return output


def spectral_de_normalize_torch(magnitudes):
    output = dynamic_range_decompression_torch(magnitudes)
    return output


mel_basis = {}
hann_window = {}


def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):

    global hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
    return spec


def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
    global mel_basis
    dtype_device = str(spec.dtype) + '_' + str(spec.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=spec.dtype, device=spec.device)
    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)
    return spec


def mel_spectrogram_torch(y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False):

    global mel_basis, hann_window
    dtype_device = str(y.dtype) + '_' + str(y.device)
    fmax_dtype_device = str(fmax) + '_' + dtype_device
    wnsize_dtype_device = str(win_size) + '_' + dtype_device
    if fmax_dtype_device not in mel_basis:
        mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
        mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(dtype=y.dtype, device=y.device)
    if wnsize_dtype_device not in hann_window:
        hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)

    y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
    y = y.squeeze(1)

    spec = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
                      center=center, pad_mode='reflect', normalized=False, onesided=True)

    spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)

    spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
    spec = spectral_normalize_torch(spec)

    return spec
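
As a reference for how these helpers are consumed, a small sketch with placeholder audio; the STFT/mel settings below are illustrative, the real ones live in the data section of egs/visinger2/config.json:

import torch
from preprocess.mel_processing import spectrogram_torch, mel_spectrogram_torch

wav = torch.randn(1, 44100)   # placeholder mono audio, shape (batch, samples)
spec = spectrogram_torch(wav, n_fft=2048, sampling_rate=44100, hop_size=512, win_size=2048)
mel = mel_spectrogram_torch(wav, n_fft=2048, num_mels=80, sampling_rate=44100,
                            hop_size=512, win_size=2048, fmin=0, fmax=22050)
print(spec.shape, mel.shape)  # (1, n_fft//2+1, frames) and (1, num_mels, frames)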
preprocess/prepare_multispeaker.py
ADDED
@@ -0,0 +1,10 @@
import os
import shutil

for spk in os.listdir("data"):
    if os.path.isdir(f"data/{spk}"):
        if os.path.exists(f"data/{spk}/raw/wavs"):
            shutil.move(f"data/{spk}/raw/wavs", f"data/{spk}")
            shutil.move(f"data/{spk}/raw/transcriptions.txt", f"data/{spk}")
            shutil.rmtree(f"data/{spk}/raw")
preprocess/preprocess.py
ADDED
@@ -0,0 +1,103 @@
import glob
import os
import sys
import argparse
import numpy as np
from multiprocessing import cpu_count
from concurrent.futures import ProcessPoolExecutor
from functools import partial
from utils import audio
import utils.utils as utils
from tqdm import tqdm
import pyworld as pw
from random import shuffle

import warnings
warnings.filterwarnings("ignore")

def extract_mel(wav, hparams):
    mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
    return mel_spectrogram.T, wav

def extract_pitch(wav, hps):
    # rapt may be better
    f0, _ = pw.harvest(wav.astype(np.float64),
                       hps.sample_rate,
                       frame_period=hps.hop_size / hps.sample_rate * 1000)
    return f0

def process_utterance(hps, data_root, item):
    out_dir = data_root

    wav_path = os.path.join(data_root, "wavs",
                            "{}.wav".format(item))
    wav = audio.load_wav(wav_path,
                         raw_sr=hps.data.sample_rate,
                         target_sr=hps.data.sample_rate,
                         win_size=hps.data.win_size,
                         hop_size=hps.data.hop_size)

    mel, _ = extract_mel(wav, hps.data)
    out_mel_dir = os.path.join(out_dir, "mels")
    os.makedirs(out_mel_dir, exist_ok=True)
    mel_path = os.path.join(out_mel_dir, item)
    np.save(mel_path, mel)

    pitch = extract_pitch(wav, hps.data)
    out_pitch_dir = os.path.join(out_dir, "pitch")
    os.makedirs(out_pitch_dir, exist_ok=True)
    pitch_path = os.path.join(out_pitch_dir, item)
    np.save(pitch_path, pitch)


def process(args, hps, data_dir):
    print(os.path.join(data_dir, "wavs"))
    if(not os.path.exists(os.path.join(data_dir, "file.list"))):
        with open(os.path.join(data_dir, "file.list"), "w") as out_file:
            files = os.listdir(os.path.join(data_dir, "wavs"))
            files = [i for i in files if i.endswith(".wav")]
            for f in files:
                out_file.write(f.strip().split(".")[0] + '\n')
    metadata = [
        item.strip() for item in open(
            os.path.join(data_dir, "file.list")).readlines()
    ]
    executor = ProcessPoolExecutor(max_workers=args.num_workers)
    results = []
    for item in metadata:
        results.append(executor.submit(partial(process_utterance, hps, data_dir, item)))
    return [result.result() for result in tqdm(results)]

def split_dataset(data_dir):
    metadata = [
        item.strip() for item in open(
            os.path.join(data_dir, "file.list")).readlines()
    ]
    shuffle(metadata)
    train_set = metadata[:-2]
    test_set = metadata[-2:]
    with open(os.path.join(data_dir, "train.list"), "w") as ts:
        for item in train_set:
            ts.write(item+"\n")
    with open(os.path.join(data_dir, "test.list"), "w") as ts:
        for item in test_set:
            ts.write(item+"\n")

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        default='config.json',
                        help='json files for configurations.')
    parser.add_argument('--num_workers', type=int, default=int(cpu_count()) // 2)

    args = parser.parse_args()
    hps = utils.get_hparams_from_file(args.config)
    spklist = [spk for spk in os.listdir("data") if os.path.isdir(f"data/{spk}") and not os.path.exists(f"data/{spk}/test.list")]
    for spk in tqdm(spklist):
        print(f"preprocessing {spk}")
        data_dir = f"data/{spk}"
        process(args, hps, data_dir)
        split_dataset(data_dir)

if __name__ == "__main__":
    main()
preprocess/preprocess_multispeaker.py
ADDED
@@ -0,0 +1,28 @@
import glob
import json

data_root = "data"


transcriptions = glob.glob(f"{data_root}/*/transcriptions.txt")
spk2id = {}
spk_id = 0
ms_transcriptions = open(f'{data_root}/transcriptions.txt', "w")
ms_train_set = open(f'{data_root}/train.list', "w")
ms_test_set = open(f'{data_root}/test.list', "w")
for transcription in transcriptions:
    spk = transcription.split("/")[-2]
    spk2id[spk] = spk_id
    spk_id += 1
    for line in open(transcription).readlines():
        ms_transcriptions.write(f"{spk}/{line}")
    for line in open(transcription.replace("transcriptions.txt", "train.list")):
        ms_train_set.write(f"{spk}/{line}")
    for line in open(transcription.replace("transcriptions.txt", "test.list")):
        ms_test_set.write(f"{spk}/{line}")

ms_transcriptions.close()
ms_train_set.close()
ms_test_set.close()
print("请手动将说话人与id的映射粘贴至config文件中")  # i.e. "Please manually paste the speaker-to-id mapping into the config file"
print(json.dumps(spk2id))
requirements.txt
ADDED
@@ -0,0 +1,9 @@
ipython==8.8.0
librosa==0.8.1
matplotlib==3.3.2
numpy==1.19.2
pyworld==0.3.0
scipy==1.5.2
soundfile==0.11.0
torch==1.8.1
tqdm==4.50.2
text/npu/__init__.py
ADDED
@@ -0,0 +1,2 @@
from text.npu import symbols
from text.npu.symbol_converter import *
text/npu/symbol_converter.py
ADDED
@@ -0,0 +1,34 @@
import re
import numpy as np
from text.npu.symbols import *
import os

# Mappings from symbol to numeric ID and vice versa:
_ttsing_phone_to_id = {p: i for i, p in enumerate(ttsing_phone_set)}
_ttsing_pitch_to_id = {p: i for i, p in enumerate(ttsing_pitch_set)}
_ttsing_slur_to_id = {s: i for i, s in enumerate(ttsing_slur_set)}

ttsing_phone_to_int = {}
int_to_ttsing_phone = {}
for idx, item in enumerate(ttsing_phone_set):
    ttsing_phone_to_int[item] = idx
    int_to_ttsing_phone[idx] = item

ttsing_pitch_to_int = {}
int_to_ttsing_pitch = {}
for idx, item in enumerate(ttsing_pitch_set):
    ttsing_pitch_to_int[item] = idx
    int_to_ttsing_pitch[idx] = item

# opencpop
ttsing_opencpop_pitch_to_int = {}
for idx, item in enumerate(ttsing_opencpop_pitch_set):
    ttsing_opencpop_pitch_to_int[item] = idx

ttsing_slur_to_int = {}
int_to_ttsing_slur = {}
for idx, item in enumerate(ttsing_slur_set):
    ttsing_slur_to_int[item] = idx
    int_to_ttsing_slur[idx] = item
text/npu/symbols.py
ADDED
@@ -0,0 +1,61 @@
ttsing_phone_set = ['_'] + [
    "b", "c", "ch", "d", "f", "g", "h", "j", "k", "l", "m", "n", "p", "q", "r",
    "s", "sh", "t", "x", "z", "zh", "a", "ai", "an", "ang", "ao", "e", "ei",
    "en", "eng", "er", "iii", "ii", "i", "ia", "ian", "iang", "iao", "ie", "in",
    "ing", "iong", "iou", "o", "ong", "ou", "u", "ua", "uai", "uan", "uang",
    "uei", "uen", "ueng", "uo", "v", "van", "ve", "vn", "AH", "AA", "AO", "ER",
    "IH", "IY", "UH", "UW", "EH", "AE", "AY", "EY", "OY", "AW", "OW", "P", "B",
    "T", "D", "K", "G", "M", "N", "NG", "L", "S", "Z", "Y", "TH", "DH", "SH",
    "ZH", "CH", "JH", "V", "W", "F", "R", "HH", "AH0", "AA0", "AO0", "ER0",
    "IH0", "IY0", "UH0", "UW0", "EH0", "AE0", "AY0", "EY0", "OY0", "AW0", "OW0",
    "AH1", "AA1", "AO1", "ER1", "IH1", "IY1", "UH1", "UW1", "EH1", "AE1", "AY1",
    "EY1", "OY1", "AW1", "OW1", "AH2", "AA2", "AO2", "ER2", "IH2", "IY2", "UH2",
    "UW2", "EH2", "AE2", "AY2", "EY2", "OY2", "AW2", "OW2", "AH3", "AA3", "AO3",
    "ER3", "IH3", "IY3", "UH3", "UW3", "EH3", "AE3", "AY3", "EY3", "OY3", "AW3",
    "OW3", "D-1", "T-1", "P*", "B*", "T*", "D*", "K*", "G*", "M*", "N*", "NG*",
    "L*", "S*", "Z*", "Y*", "TH*", "DH*", "SH*", "ZH*", "CH*", "JH*", "V*",
    "W*", "F*", "R*", "HH*", "sp", "sil", "or", "ar", "aor", "our", "angr",
    "eir", "engr", "air", "ianr", "iaor", "ir", "ingr", "ur", "iiir", "uar",
    "uangr", "uenr", "iir", "ongr", "uor", "ueir", "iar", "iangr", "inr",
    "iour", "vr", "uanr", "ruai", "TR", "rest",
    # opencpop
    'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu',
    "iour", "vr", "uanr", "ruai", "TR", "rest",
    # opencpop
    'w', 'SP', 'AP', 'un', 'y', 'ui', 'iu',
    # opencpop-strict
    'i0', 'E', 'En'
]

ttsing_pitch_set = ['_'] + [
    "C0", "C1", "C2", "C3", "C4", "C5", "C6", "C#/Db0", "C#/Db1", "C#/Db2",
    "C#/Db3", "C#/Db4", "C#/Db5", "C#/Db6", "D0", "D1", "D2", "D3", "D4", "D5",
    "D6", "D#/Eb0", "D#/Eb1", "D#/Eb2", "D#/Eb3", "D#/Eb4", "D#/Eb5", "D#/Eb6",
    "E0", "E1", "E2", "E3", "E4", "E5", "E6", "F0", "F1", "F2", "F3", "F4",
    "F5", "F6", "F#/Gb0", "F#/Gb1", "F#/Gb2", "F#/Gb3", "F#/Gb4", "F#/Gb5",
    "F#/Gb6", "G0", "G1", "G2", "G3", "G4", "G5", "G6", "G#/Ab0", "G#/Ab1",
    "G#/Ab2", "G#/Ab3", "G#/Ab4", "G#/Ab5", "G#/Ab6", "A0", "A1", "A2", "A3",
    "A4", "A5", "A6", "A#/Bb0", "A#/Bb1", "A#/Bb2", "A#/Bb3", "A#/Bb4",
    "A#/Bb5", "A#/Bb6", "B0", "B1", "B2", "B3", "B4", "B5", "B6", "RestRest"
]

ttsing_opencpop_pitch_set = ['_'] + [
    "C0", "C1", "C2", "C3", "C4", "C5", "C6",
    "C#0/Db0", "C#1/Db1", "C#2/Db2", "C#3/Db3", "C#4/Db4", "C#5/Db5", "C#6/Db6",
    "D0", "D1", "D2", "D3", "D4", "D5", "D6",
    "D#0/Eb0", "D#1/Eb1", "D#2/Eb2", "D#3/Eb3", "D#4/Eb4", "D#5/Eb5", "D#6/Eb6",
    "E0", "E1", "E2", "E3", "E4", "E5", "E6",
    "F0", "F1", "F2", "F3", "F4", "F5", "F6",
    "F#0/Gb0", "F#1/Gb1", "F#2/Gb2", "F#3/Gb3", "F#4/Gb4", "F#5/Gb5", "F#6/Gb6",
    "G0", "G1", "G2", "G3", "G4", "G5", "G6",
    "G#0/Ab0", "G#1/Ab1", "G#2/Ab2", "G#3/Ab3", "G#4/Ab4", "G#5/Ab5", "G#6/Ab6",
    "A0", "A1", "A2", "A3", "A4", "A5", "A6",
    "A#0/Bb0", "A#1/Bb1", "A#2/Bb2", "A#3/Bb3", "A#4/Bb4", "A#5/Bb5", "A#6/Bb6",
    "B0", "B1", "B2", "B3", "B4", "B5", "B6",
    "RestRest", "rest"
]

ttsing_slur_set = ['_'] + ['0', '1']
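
symbol_converter.py above turns these lists into plain dict lookups via enumerate; a tiny sketch of how they are queried (the specific phoneme and pitch names are just examples from the lists):

from text.npu import ttsing_phone_to_int, int_to_ttsing_phone, ttsing_opencpop_pitch_to_int

pid = ttsing_phone_to_int["zh"]                 # phoneme -> integer id
assert int_to_ttsing_phone[pid] == "zh"         # ...and back again
rest_id = ttsing_opencpop_pitch_to_int["rest"]  # opencpop-style pitch names map the same way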
utils/__init__.py
ADDED
File without changes
utils/audio.py
ADDED
@@ -0,0 +1,99 @@
import numpy as np
from numpy import linalg as LA
import librosa
from scipy.io import wavfile
import soundfile as sf
import librosa.filters


def load_wav(wav_path, raw_sr, target_sr=16000, win_size=800, hop_size=200):
    audio = librosa.core.load(wav_path, sr=raw_sr)[0]
    if raw_sr != target_sr:
        audio = librosa.core.resample(audio,
                                      raw_sr,
                                      target_sr,
                                      res_type='kaiser_best')
    target_length = (audio.size // hop_size +
                     win_size // hop_size) * hop_size
    pad_len = (target_length - audio.size) // 2
    if audio.size % 2 == 0:
        audio = np.pad(audio, (pad_len, pad_len), mode='reflect')
    else:
        audio = np.pad(audio, (pad_len, pad_len + 1), mode='reflect')
    return audio


def save_wav(wav, path, sample_rate, norm=False):
    if norm:
        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
        wavfile.write(path, sample_rate, wav.astype(np.int16))
    else:
        sf.write(path, wav, sample_rate)


_mel_basis = None
_inv_mel_basis = None


def _build_mel_basis(hparams):
    assert hparams.fmax <= hparams.sample_rate // 2
    return librosa.filters.mel(hparams.sample_rate,
                               hparams.n_fft,
                               n_mels=hparams.acoustic_dim,
                               fmin=hparams.fmin,
                               fmax=hparams.fmax)


def _linear_to_mel(spectogram, hparams):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis(hparams)
    return np.dot(_mel_basis, spectogram)


def _mel_to_linear(mel_spectrogram, hparams):
    global _inv_mel_basis
    if _inv_mel_basis is None:
        _inv_mel_basis = np.linalg.pinv(_build_mel_basis(hparams))
    return np.maximum(1e-10, np.dot(_inv_mel_basis, mel_spectrogram))


def _stft(y, hparams):
    return librosa.stft(y=y,
                        n_fft=hparams.n_fft,
                        hop_length=hparams.hop_size,
                        win_length=hparams.win_size)


def _amp_to_db(x, hparams):
    min_level = np.exp(hparams.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _normalize(S, hparams):
    return hparams.max_abs_value * np.clip(((S - hparams.min_db) /
                                            (-hparams.min_db)), 0, 1)

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)


def _stft(y, hparams):
    return librosa.stft(y=y,
                        n_fft=hparams.n_fft,
                        hop_length=hparams.hop_size,
                        win_length=hparams.win_size)


def _istft(y, hparams):
    return librosa.istft(y,
                         hop_length=hparams.hop_size,
                         win_length=hparams.win_size)


def melspectrogram(wav, hparams):
    D = _stft(wav, hparams)
    S = _amp_to_db(_linear_to_mel(np.abs(D), hparams),
                   hparams) - hparams.ref_level_db
    return _normalize(S, hparams)
utils/utils.py
ADDED
@@ -0,0 +1,268 @@
import os
import glob
import sys
import argparse
import logging
import json
import subprocess
import numpy as np
from scipy.io.wavfile import read
import torch

MATPLOTLIB_FLAG = False

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logger = logging


def load_checkpoint(checkpoint_path, model, optimizer=None):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location='cpu')
    iteration = checkpoint_dict['iteration']
    learning_rate = checkpoint_dict['learning_rate']
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint_dict['optimizer'])
    saved_state_dict = checkpoint_dict['model']
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        try:
            new_state_dict[k] = saved_state_dict[k]
            assert saved_state_dict[k].shape == v.shape, (saved_state_dict[k].shape, v.shape)
        except:
            print("error, %s is not in the checkpoint" % k)
            logger.info("%s is not in the checkpoint" % k)
            new_state_dict[k] = v
    if hasattr(model, 'module'):
        model.module.load_state_dict(new_state_dict)
    else:
        model.load_state_dict(new_state_dict)
    print("load ")
    logger.info("Loaded checkpoint '{}' (iteration {})".format(
        checkpoint_path, iteration))
    return model, optimizer, learning_rate, iteration


def save_checkpoint(model, optimizer, learning_rate, iteration, checkpoint_path, val_steps):
    ckptname = checkpoint_path.split(os.sep)[-1]
    newest_step = int(ckptname.split(".")[0].split("_")[1])
    last_ckptname = checkpoint_path.replace(str(newest_step), str(newest_step - val_steps * 2))
    if newest_step >= val_steps * 2:
        os.system(f"rm {last_ckptname}")

    logger.info("Saving model and optimizer state at iteration {} to {}".format(
        iteration, checkpoint_path))
    if hasattr(model, 'module'):
        state_dict = model.module.state_dict()
    else:
        state_dict = model.state_dict()
    torch.save({'model': state_dict,
                'iteration': iteration,
                'optimizer': optimizer.state_dict(),
                'learning_rate': learning_rate}, checkpoint_path)


def summarize(writer, global_step, scalars={}, histograms={}, images={}, audios={}, audio_sampling_rate=22050):
    for k, v in scalars.items():
        writer.add_scalar(k, v, global_step)
    for k, v in histograms.items():
        writer.add_histogram(k, v, global_step)
    for k, v in images.items():
        writer.add_image(k, v, global_step, dataformats='HWC')
    for k, v in audios.items():
        writer.add_audio(k, v, global_step, audio_sampling_rate)


def latest_checkpoint_path(dir_path, regex="G_*.pth"):
    f_list = glob.glob(os.path.join(dir_path, regex))
    f_list.sort(key=lambda f: int("".join(filter(str.isdigit, f))))
    x = f_list[-1]
    print(x)
    return x


def plot_spectrogram_to_numpy(spectrogram):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(10, 2))
    im = ax.imshow(spectrogram, aspect="auto", origin="lower",
                   interpolation='none')
    plt.colorbar(im, ax=ax)
    plt.xlabel("Frames")
    plt.ylabel("Channels")
    plt.tight_layout()

    fig.canvas.draw()
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def plot_alignment_to_numpy(alignment, info=None):
    global MATPLOTLIB_FLAG
    if not MATPLOTLIB_FLAG:
        import matplotlib
        matplotlib.use("Agg")
        MATPLOTLIB_FLAG = True
        mpl_logger = logging.getLogger('matplotlib')
        mpl_logger.setLevel(logging.WARNING)
    import matplotlib.pylab as plt
    import numpy as np

    fig, ax = plt.subplots(figsize=(6, 4))
    im = ax.imshow(alignment.transpose(), aspect='auto', origin='lower',
                   interpolation='none')
    fig.colorbar(im, ax=ax)
    xlabel = 'Decoder timestep'
    if info is not None:
        xlabel += '\n\n' + info
    plt.xlabel(xlabel)
    plt.ylabel('Encoder timestep')
    plt.tight_layout()

    fig.canvas.draw()
    data = np.fromstring(fig.canvas.tostring_rgb(), dtype=np.uint8, sep='')
    data = data.reshape(fig.canvas.get_width_height()[::-1] + (3,))
    plt.close()
    return data


def load_wav_to_torch(full_path):
    sampling_rate, data = read(full_path)
    return torch.FloatTensor(data.astype(np.float32)), sampling_rate


def load_filepaths_and_text(filename, split="|"):
    with open(filename, encoding='utf-8') as f:
        filepaths_and_text = [line.strip().split(split) for line in f]
    return filepaths_and_text


def get_hparams(init=True):
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', type=str, default="./configs/base.json",
                        help='JSON file for configuration')
    # parser.add_argument('-m', '--model', type=str, required=True,
    #                     help='Model name')

    args = parser.parse_args()

    config_path = args.config
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    # hparams.model_dir = model_dir
    model_dir = hparams.train.save_dir
    config_save_path = os.path.join(model_dir, "config.json")

    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    with open(config_save_path, "w") as f:
        f.write(data)
    return hparams


def get_hparams_from_dir(model_dir):
    config_save_path = os.path.join(model_dir, "config.json")
    with open(config_save_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    hparams.model_dir = model_dir
    return hparams


def get_hparams_from_file(config_path):
    with open(config_path, "r") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams


def check_git_hash(model_dir):
    source_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    if not os.path.exists(os.path.join(source_dir, ".git")):
        logger.warn("{} is not a git repository, therefore hash value comparison will be ignored.".format(
            source_dir
        ))
        return

    cur_hash = subprocess.getoutput("git rev-parse HEAD")

    path = os.path.join(model_dir, "githash")
    if os.path.exists(path):
        saved_hash = open(path).read()
        if saved_hash != cur_hash:
            logger.warn("git hash values are different. {}(saved) != {}(current)".format(
                saved_hash[:8], cur_hash[:8]))
    else:
        open(path, "w").write(cur_hash)


def get_logger(model_dir, filename="train.log"):
    global logger
    logger = logging.getLogger(os.path.basename(model_dir))
    logger.setLevel(logging.DEBUG)

    formatter = logging.Formatter("%(asctime)s\t%(name)s\t%(levelname)s\t%(message)s")
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    h = logging.FileHandler(os.path.join(model_dir, filename))
    h.setLevel(logging.DEBUG)
    h.setFormatter(formatter)
    logger.addHandler(h)
    return logger


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6


class HParams():
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()
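
Finally, a small sketch of the HParams container defined above, since the rest of the codebase leans on its attribute-style access; the keys shown are illustrative, not the full training config:

from utils.utils import HParams

hps = HParams(data={"sample_rate": 44100, "hop_size": 512},
              train={"save_dir": "ckpt"})
print(hps.data.sample_rate)       # nested dicts become nested HParams -> 44100
print(hps["train"]["save_dir"])   # item access is routed to getattr -> "ckpt"
print("data" in hps, len(hps))    # __contains__ / __len__ over the underlying __dict__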