ftshijt committed on
Commit
cee6458
1 Parent(s): 53eec2e

Update model

Browse files
Files changed (23) hide show
  1. README.md +334 -1
  2. exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz +3 -0
  3. exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz +3 -0
  4. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/315epoch.pth +3 -0
  5. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml +253 -0
  6. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/backward_time.png +0 -0
  7. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/clip.png +0 -0
  8. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/decoder_alpha.png +0 -0
  9. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/duration_loss.png +0 -0
  10. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/encoder_alpha.png +0 -0
  11. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/forward_time.png +0 -0
  12. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png +0 -0
  13. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/grad_norm.png +0 -0
  14. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/iter_time.png +0 -0
  15. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss.png +0 -0
  16. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss_scale.png +0 -0
  17. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/mel_loss.png +0 -0
  18. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png +0 -0
  19. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim_step_time.png +0 -0
  20. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/pitch_loss.png +0 -0
  21. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/train_time.png +0 -0
  22. exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/vuv_loss.png +0 -0
  23. meta.yaml +8 -0
README.md CHANGED
@@ -1,3 +1,336 @@
1
  ---
2
- license: apache-2.0
 
 
 
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - singing-voice-synthesis
6
+ language: ja
7
+ datasets:
8
+ - oniku_kurumi_utagoe_db
9
+ license: cc-by-4.0
10
  ---
11
+
12
+ ## ESPnet2 SVS model
13
+
14
+ ### `espnet/oniku_kurumi_utagoe_db_xiaoice`
15
+
16
+ This model was trained by ftshijt using the oniku_kurumi_utagoe_db recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 5c4d7cf7feba8461de2e1080bf82182f0efaef38
26
+ pip install -e .
27
+ cd egs2/oniku_kurumi_utagoe_db/svs1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/oniku_kurumi_utagoe_db_xiaoice
29
+ ```
30
+
31
+
32
+
33
+ ## SVS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/tuning/train_xiaoice.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ drop_last_iter: false
42
+ dry_run: false
43
+ iterator_type: sequence
44
+ valid_iterator_type: null
45
+ output_dir: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp
46
+ ngpu: 1
47
+ seed: 0
48
+ num_workers: 10
49
+ num_att_plot: 3
50
+ dist_backend: nccl
51
+ dist_init_method: env://
52
+ dist_world_size: null
53
+ dist_rank: null
54
+ local_rank: 0
55
+ dist_master_addr: null
56
+ dist_master_port: null
57
+ dist_launcher: null
58
+ multiprocessing_distributed: false
59
+ unused_parameters: false
60
+ sharded_ddp: false
61
+ cudnn_enabled: true
62
+ cudnn_benchmark: false
63
+ cudnn_deterministic: true
64
+ collect_stats: false
65
+ write_collected_feats: false
66
+ max_epoch: 500
67
+ patience: null
68
+ val_scheduler_criterion:
69
+ - valid
70
+ - loss
71
+ early_stopping_criterion:
72
+ - valid
73
+ - loss
74
+ - min
75
+ best_model_criterion:
76
+ - - valid
77
+ - loss
78
+ - min
79
+ - - train
80
+ - loss
81
+ - min
82
+ keep_nbest_models: 5
83
+ nbest_averaging_interval: 0
84
+ grad_clip: 1.0
85
+ grad_clip_type: 2.0
86
+ grad_noise: false
87
+ accum_grad: 1
88
+ no_forward_run: false
89
+ resume: true
90
+ train_dtype: float32
91
+ use_amp: false
92
+ log_interval: null
93
+ use_matplotlib: true
94
+ use_tensorboard: true
95
+ create_graph_in_tensorboard: false
96
+ use_wandb: false
97
+ wandb_project: null
98
+ wandb_id: null
99
+ wandb_entity: null
100
+ wandb_name: null
101
+ wandb_model_log_interval: -1
102
+ detect_anomaly: false
103
+ use_lora: false
104
+ save_lora_only: true
105
+ lora_conf: {}
106
+ pretrain_path: null
107
+ init_param: []
108
+ ignore_init_mismatch: false
109
+ freeze_param: []
110
+ num_iters_per_epoch: 500
111
+ batch_size: 16
112
+ valid_batch_size: null
113
+ batch_bins: 1000000
114
+ valid_batch_bins: null
115
+ train_shape_file:
116
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
117
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
118
+ valid_shape_file:
119
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
120
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
121
+ batch_type: sorted
122
+ valid_batch_type: null
123
+ fold_length:
124
+ - 150
125
+ - 240000
126
+ sort_in_batch: descending
127
+ shuffle_within_batch: false
128
+ sort_batch: descending
129
+ multiple_iterator: false
130
+ chunk_length: 500
131
+ chunk_shift_ratio: 0.5
132
+ num_cache_chunks: 1024
133
+ chunk_excluded_key_prefixes: []
134
+ chunk_default_fs: null
135
+ train_data_path_and_name_and_type:
136
+ - - dump/raw/tr_no_dev/text
137
+ - text
138
+ - text
139
+ - - dump/raw/tr_no_dev/wav.scp
140
+ - singing
141
+ - sound
142
+ - - dump/raw/tr_no_dev/label
143
+ - label
144
+ - duration
145
+ - - dump/raw/tr_no_dev/score.scp
146
+ - score
147
+ - score
148
+ valid_data_path_and_name_and_type:
149
+ - - dump/raw/dev/text
150
+ - text
151
+ - text
152
+ - - dump/raw/dev/wav.scp
153
+ - singing
154
+ - sound
155
+ - - dump/raw/dev/label
156
+ - label
157
+ - duration
158
+ - - dump/raw/dev/score.scp
159
+ - score
160
+ - score
161
+ allow_variable_data_keys: false
162
+ max_cache_size: 0.0
163
+ max_cache_fd: 32
164
+ allow_multi_rates: false
165
+ valid_max_cache_size: null
166
+ exclude_weight_decay: false
167
+ exclude_weight_decay_conf: {}
168
+ optim: adam
169
+ optim_conf:
170
+ lr: 0.001
171
+ eps: 1.0e-06
172
+ weight_decay: 0.0
173
+ scheduler: null
174
+ scheduler_conf: {}
175
+ token_list:
176
+ - <blank>
177
+ - <unk>
178
+ - pau
179
+ - a
180
+ - o
181
+ - i
182
+ - u
183
+ - e
184
+ - k
185
+ - n
186
+ - r
187
+ - m
188
+ - t
189
+ - N
190
+ - s
191
+ - w
192
+ - y
193
+ - sh
194
+ - g
195
+ - d
196
+ - ch
197
+ - b
198
+ - ts
199
+ - p
200
+ - z
201
+ - h
202
+ - f
203
+ - j
204
+ - cl
205
+ - ry
206
+ - ky
207
+ - gy
208
+ - ny
209
+ - hy
210
+ - my
211
+ - v
212
+ - by
213
+ - py
214
+ - ty
215
+ - dy
216
+ - <sos/eos>
217
+ odim: null
218
+ model_conf: {}
219
+ use_preprocessor: true
220
+ token_type: phn
221
+ bpemodel: null
222
+ non_linguistic_symbols: null
223
+ cleaner: null
224
+ g2p: pyopenjtalk
225
+ fs: 24000
226
+ score_feats_extract: syllable_score_feats
227
+ score_feats_extract_conf:
228
+ fs: 24000
229
+ n_fft: 2048
230
+ win_length: 1200
231
+ hop_length: 300
232
+ feats_extract: fbank
233
+ feats_extract_conf:
234
+ n_fft: 2048
235
+ hop_length: 300
236
+ win_length: 1200
237
+ fs: 24000
238
+ fmin: 80
239
+ fmax: 7600
240
+ n_mels: 80
241
+ normalize: global_mvn
242
+ normalize_conf:
243
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
244
+ svs: xiaoice
245
+ svs_conf:
246
+ midi_dim: 129
247
+ duration_dim: 500
248
+ adim: 384
249
+ aheads: 4
250
+ elayers: 6
251
+ eunits: 1536
252
+ dlayers: 6
253
+ dunits: 1536
254
+ postnet_layers: 5
255
+ postnet_chans: 512
256
+ postnet_filts: 5
257
+ postnet_dropout_rate: 0.5
258
+ use_batch_norm: true
259
+ reduction_factor: 1
260
+ init_type: pytorch
261
+ use_masking: true
262
+ loss_function: XiaoiceSing2
263
+ loss_type: L1
264
+ lambda_mel: 1
265
+ lambda_dur: 0.1
266
+ lambda_pitch: 0.01
267
+ lambda_vuv: 0.01
268
+ pitch_extract: dio
269
+ pitch_extract_conf:
270
+ use_token_averaged_f0: false
271
+ fs: 24000
272
+ n_fft: 2048
273
+ hop_length: 300
274
+ f0max: 800
275
+ f0min: 80
276
+ reduction_factor: 1
277
+ pitch_normalize: global_mvn
278
+ pitch_normalize_conf:
279
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
280
+ ying_extract: null
281
+ ying_extract_conf: {}
282
+ energy_extract: null
283
+ energy_extract_conf: {}
284
+ energy_normalize: null
285
+ energy_normalize_conf: {}
286
+ required:
287
+ - output_dir
288
+ - token_list
289
+ version: '202310'
290
+ distributed: false
291
+ ```
292
+
293
+ </details>
294
+
295
+
296
+
297
+ ### Citing ESPnet
298
+
299
+ ```bibtex
300
+ @inproceedings{watanabe2018espnet,
301
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
302
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
303
+ year={2018},
304
+ booktitle={Proceedings of Interspeech},
305
+ pages={2207--2211},
306
+ doi={10.21437/Interspeech.2018-1456},
307
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
308
+ }
309
+
310
+
311
+
312
+
313
+
314
+
315
+ @inproceedings{shi22d_interspeech,
316
+ author={Jiatong Shi and Shuai Guo and Tao Qian and Tomoki Hayashi and Yuning Wu and Fangzheng Xu and Xuankai Chang and Huazhe Li and Peter Wu and Shinji Watanabe and Qin Jin},
317
+ title={{Muskits: an End-to-end Music Processing Toolkit for Singing Voice Synthesis}},
318
+ year=2022,
319
+ booktitle={Proc. Interspeech 2022},
320
+ pages={4277--4281},
321
+ doi={10.21437/Interspeech.2022-10039}
322
+ }
323
+ ```
324
+
325
+ or arXiv:
326
+
327
+ ```bibtex
328
+ @misc{watanabe2018espnet,
329
+ title={ESPnet: End-to-End Speech Processing Toolkit},
330
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
331
+ year={2018},
332
+ eprint={1804.00015},
333
+ archivePrefix={arXiv},
334
+ primaryClass={cs.CL}
335
+ }
336
+ ```
exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15794e62a390e2aaeaaef623409dd499cc50256935080a1fbaa51d8831a0a1eb
3
+ size 1402
exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ea4d5243bf5e3904456ebf3fb51a31c677754f10376725b551bd96b3db8e00b
3
+ size 770
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/315epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acd93a0e3a39cc0f04008ffeeefb9e338598ce621675eabdc84aeb0f8e96e152
3
+ size 107710731
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_xiaoice.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ drop_last_iter: false
5
+ dry_run: false
6
+ iterator_type: sequence
7
+ valid_iterator_type: null
8
+ output_dir: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp
9
+ ngpu: 1
10
+ seed: 0
11
+ num_workers: 10
12
+ num_att_plot: 3
13
+ dist_backend: nccl
14
+ dist_init_method: env://
15
+ dist_world_size: null
16
+ dist_rank: null
17
+ local_rank: 0
18
+ dist_master_addr: null
19
+ dist_master_port: null
20
+ dist_launcher: null
21
+ multiprocessing_distributed: false
22
+ unused_parameters: false
23
+ sharded_ddp: false
24
+ cudnn_enabled: true
25
+ cudnn_benchmark: false
26
+ cudnn_deterministic: true
27
+ collect_stats: false
28
+ write_collected_feats: false
29
+ max_epoch: 500
30
+ patience: null
31
+ val_scheduler_criterion:
32
+ - valid
33
+ - loss
34
+ early_stopping_criterion:
35
+ - valid
36
+ - loss
37
+ - min
38
+ best_model_criterion:
39
+ - - valid
40
+ - loss
41
+ - min
42
+ - - train
43
+ - loss
44
+ - min
45
+ keep_nbest_models: 5
46
+ nbest_averaging_interval: 0
47
+ grad_clip: 1.0
48
+ grad_clip_type: 2.0
49
+ grad_noise: false
50
+ accum_grad: 1
51
+ no_forward_run: false
52
+ resume: true
53
+ train_dtype: float32
54
+ use_amp: false
55
+ log_interval: null
56
+ use_matplotlib: true
57
+ use_tensorboard: true
58
+ create_graph_in_tensorboard: false
59
+ use_wandb: false
60
+ wandb_project: null
61
+ wandb_id: null
62
+ wandb_entity: null
63
+ wandb_name: null
64
+ wandb_model_log_interval: -1
65
+ detect_anomaly: false
66
+ use_lora: false
67
+ save_lora_only: true
68
+ lora_conf: {}
69
+ pretrain_path: null
70
+ init_param: []
71
+ ignore_init_mismatch: false
72
+ freeze_param: []
73
+ num_iters_per_epoch: 500
74
+ batch_size: 16
75
+ valid_batch_size: null
76
+ batch_bins: 1000000
77
+ valid_batch_bins: null
78
+ train_shape_file:
79
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/text_shape.phn
80
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/train/singing_shape
81
+ valid_shape_file:
82
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/text_shape.phn
83
+ - exp/svs_stats_raw_phn_pyopenjtalk_jp/valid/singing_shape
84
+ batch_type: sorted
85
+ valid_batch_type: null
86
+ fold_length:
87
+ - 150
88
+ - 240000
89
+ sort_in_batch: descending
90
+ shuffle_within_batch: false
91
+ sort_batch: descending
92
+ multiple_iterator: false
93
+ chunk_length: 500
94
+ chunk_shift_ratio: 0.5
95
+ num_cache_chunks: 1024
96
+ chunk_excluded_key_prefixes: []
97
+ chunk_default_fs: null
98
+ train_data_path_and_name_and_type:
99
+ - - dump/raw/tr_no_dev/text
100
+ - text
101
+ - text
102
+ - - dump/raw/tr_no_dev/wav.scp
103
+ - singing
104
+ - sound
105
+ - - dump/raw/tr_no_dev/label
106
+ - label
107
+ - duration
108
+ - - dump/raw/tr_no_dev/score.scp
109
+ - score
110
+ - score
111
+ valid_data_path_and_name_and_type:
112
+ - - dump/raw/dev/text
113
+ - text
114
+ - text
115
+ - - dump/raw/dev/wav.scp
116
+ - singing
117
+ - sound
118
+ - - dump/raw/dev/label
119
+ - label
120
+ - duration
121
+ - - dump/raw/dev/score.scp
122
+ - score
123
+ - score
124
+ allow_variable_data_keys: false
125
+ max_cache_size: 0.0
126
+ max_cache_fd: 32
127
+ allow_multi_rates: false
128
+ valid_max_cache_size: null
129
+ exclude_weight_decay: false
130
+ exclude_weight_decay_conf: {}
131
+ optim: adam
132
+ optim_conf:
133
+ lr: 0.001
134
+ eps: 1.0e-06
135
+ weight_decay: 0.0
136
+ scheduler: null
137
+ scheduler_conf: {}
138
+ token_list:
139
+ - <blank>
140
+ - <unk>
141
+ - pau
142
+ - a
143
+ - o
144
+ - i
145
+ - u
146
+ - e
147
+ - k
148
+ - n
149
+ - r
150
+ - m
151
+ - t
152
+ - N
153
+ - s
154
+ - w
155
+ - y
156
+ - sh
157
+ - g
158
+ - d
159
+ - ch
160
+ - b
161
+ - ts
162
+ - p
163
+ - z
164
+ - h
165
+ - f
166
+ - j
167
+ - cl
168
+ - ry
169
+ - ky
170
+ - gy
171
+ - ny
172
+ - hy
173
+ - my
174
+ - v
175
+ - by
176
+ - py
177
+ - ty
178
+ - dy
179
+ - <sos/eos>
180
+ odim: null
181
+ model_conf: {}
182
+ use_preprocessor: true
183
+ token_type: phn
184
+ bpemodel: null
185
+ non_linguistic_symbols: null
186
+ cleaner: null
187
+ g2p: pyopenjtalk
188
+ fs: 24000
189
+ score_feats_extract: syllable_score_feats
190
+ score_feats_extract_conf:
191
+ fs: 24000
192
+ n_fft: 2048
193
+ win_length: 1200
194
+ hop_length: 300
195
+ feats_extract: fbank
196
+ feats_extract_conf:
197
+ n_fft: 2048
198
+ hop_length: 300
199
+ win_length: 1200
200
+ fs: 24000
201
+ fmin: 80
202
+ fmax: 7600
203
+ n_mels: 80
204
+ normalize: global_mvn
205
+ normalize_conf:
206
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/feats_stats.npz
207
+ svs: xiaoice
208
+ svs_conf:
209
+ midi_dim: 129
210
+ duration_dim: 500
211
+ adim: 384
212
+ aheads: 4
213
+ elayers: 6
214
+ eunits: 1536
215
+ dlayers: 6
216
+ dunits: 1536
217
+ postnet_layers: 5
218
+ postnet_chans: 512
219
+ postnet_filts: 5
220
+ postnet_dropout_rate: 0.5
221
+ use_batch_norm: true
222
+ reduction_factor: 1
223
+ init_type: pytorch
224
+ use_masking: true
225
+ loss_function: XiaoiceSing2
226
+ loss_type: L1
227
+ lambda_mel: 1
228
+ lambda_dur: 0.1
229
+ lambda_pitch: 0.01
230
+ lambda_vuv: 0.01
231
+ pitch_extract: dio
232
+ pitch_extract_conf:
233
+ use_token_averaged_f0: false
234
+ fs: 24000
235
+ n_fft: 2048
236
+ hop_length: 300
237
+ f0max: 800
238
+ f0min: 80
239
+ reduction_factor: 1
240
+ pitch_normalize: global_mvn
241
+ pitch_normalize_conf:
242
+ stats_file: exp/svs_stats_raw_phn_pyopenjtalk_jp/train/pitch_stats.npz
243
+ ying_extract: null
244
+ ying_extract_conf: {}
245
+ energy_extract: null
246
+ energy_extract_conf: {}
247
+ energy_normalize: null
248
+ energy_normalize_conf: {}
249
+ required:
250
+ - output_dir
251
+ - token_list
252
+ version: '202310'
253
+ distributed: false
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/backward_time.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/clip.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/decoder_alpha.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/duration_loss.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/encoder_alpha.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/forward_time.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/gpu_max_cached_mem_GB.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/grad_norm.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/iter_time.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/loss_scale.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/mel_loss.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim0_lr0.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/optim_step_time.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/pitch_loss.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/train_time.png ADDED
exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/images/vuv_loss.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202310'
2
+ files:
3
+ model_file: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/315epoch.pth
4
+ python: "3.9.16 (main, Mar 8 2023, 14:00:05) \n[GCC 11.2.0]"
5
+ timestamp: 1702893716.97948
6
+ torch: 1.13.1+cu117
7
+ yaml_files:
8
+ train_config: exp/svs_train_xiaoice_raw_phn_pyopenjtalk_jp/config.yaml