Spaces:

Mahiruoshi
/

BangDream-Bert-VITS2

Running

App Files Files Community

Mahiruoshi committed on Dec 16, 2023

Commit

0372395

•

1 Parent(s): 6510319

Upload 158 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.pre-commit-config.yaml +2 -2
Data/BangDreamV22/configs/config.json +197 -0
Data/BangDreamV22/models/G_51000.pth +3 -0
app.py +175 -272
bert/bert_models.json +2 -2
bert/deberta-v2-large-japanese-char-wwm/.gitattributes +34 -0
bert/deberta-v2-large-japanese-char-wwm/README.md +89 -0
bert/deberta-v2-large-japanese-char-wwm/config.json +37 -0
bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin +3 -0
bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json +7 -0
bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json +19 -0
bert/deberta-v2-large-japanese-char-wwm/vocab.txt +0 -0
bert_gen.py +8 -7
clap_gen.py +64 -0
clap_wrapper.py +49 -0
commons.py +6 -14
compress_model.py +89 -0
config.py +13 -2
config.yml +35 -18
configs/config.json +865 -99
css/custom.css +18 -0
data_utils.py +29 -9
default_config.yml +35 -18
emotional/clap-htsat-fused/.gitattributes +34 -0
emotional/clap-htsat-fused/README.md +107 -0
emotional/clap-htsat-fused/config.json +207 -0
emotional/clap-htsat-fused/merges.txt +0 -0
emotional/clap-htsat-fused/preprocessor_config.json +22 -0
emotional/clap-htsat-fused/pytorch_model.bin +3 -0
emotional/clap-htsat-fused/special_tokens_map.json +15 -0
emotional/clap-htsat-fused/tokenizer.json +0 -0
emotional/clap-htsat-fused/tokenizer_config.json +16 -0
emotional/clap-htsat-fused/vocab.json +0 -0
empty_emo.npy +3 -0
export_onnx.py +4 -48
filelists/sample.list +3 -0
img/yuyu.png +0 -0
img//345/217/202/346/225/260/350/257/264/346/230/216.png +0 -0
img//345/256/265/345/256/253.png +0 -0
img//345/276/256/344/277/241/345/233/276/347/211/207_20231010105112.png +0 -0
img//347/245/236/351/207/214/347/273/253/345/215/216.png +0 -0
img//347/272/263/350/245/277/345/246/262.png +0 -0
infer.py +186 -12
models.py +77 -6
monotonic_align/__pycache__/__init__.cpython-311.pyc +0 -0
monotonic_align/__pycache__/core.cpython-311.pyc +0 -0
onnx_modules/V200/__init__.py +0 -0
onnx_modules/V200/attentions_onnx.py +378 -0
onnx_modules/V200/models_onnx.py +990 -0
onnx_modules/V200/text/__init__.py +1 -0

.pre-commit-config.yaml CHANGED Viewed

@@ -7,13 +7,13 @@ repos:
       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.4
     hooks:
       - id: ruff
         args: [ --fix ]
   - repo: https://github.com/psf/black
-    rev: 23.10.1
     hooks:
       - id: black

       - id: trailing-whitespace
   - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.7
     hooks:
       - id: ruff
         args: [ --fix ]
   - repo: https://github.com/psf/black
+    rev: 23.11.0
     hooks:
       - id: black

Data/BangDreamV22/configs/config.json ADDED Viewed

	@@ -0,0 +1,197 @@

+{
+  "train": {
+    "log_interval": 200,
+    "eval_interval": 3000,
+    "seed": 42,
+    "epochs": 1000,
+    "learning_rate": 0.0002,
+    "betas": [
+      0.8,
+      0.99
+    ],
+    "eps": 1e-09,
+    "batch_size": 10,
+    "fp16_run": false,
+    "lr_decay": 0.99995,
+    "segment_size": 16384,
+    "init_lr_ratio": 1,
+    "warmup_epochs": 0,
+    "c_mel": 45,
+    "c_kl": 1.0,
+    "skip_optimizer": true,
+    "freeze_ZH_bert": false,
+    "freeze_JP_bert": false,
+    "freeze_EN_bert": false
+  },
+  "data": {
+    "training_files": "Data/BangDream/filelists/train.list",
+    "validation_files": "Data/BangDream/filelists/val.list",
+    "max_wav_value": 32768.0,
+    "sampling_rate": 44100,
+    "filter_length": 2048,
+    "hop_length": 512,
+    "win_length": 2048,
+    "n_mel_channels": 128,
+    "mel_fmin": 0.0,
+    "mel_fmax": null,
+    "add_blank": true,
+    "n_speakers": 99,
+    "cleaned_text": true,
+    "spk2id": {
+      "香澄": 0,
+      "有咲": 1,
+      "沙綾": 2,
+      "りみ": 3,
+      "たえ": 4,
+      "沙綾、りみ、たえ": 5,
+      "三月七1": 6,
+      "紗夜": 7,
+      "ロック": 8,
+      "パレオ": 9,
+      "レイヤ": 10,
+      "チュチュ": 11,
+      "彩": 12,
+      "千聖": 13,
+      "イヴ": 14,
+      "日菜": 15,
+      "麻弥": 16,
+      "蘭": 17,
+      "モカ": 18,
+      "巴": 19,
+      "ひまり": 20,
+      "つぐみ": 21,
+      "はぐみ": 22,
+      "花音": 23,
+      "美咲": 24,
+      "薫": 25,
+      "こころ": 26,
+      "つくし": 27,
+      "七深": 28,
+      "透子": 29,
+      "ましろ": 30,
+      "瑠唯": 31,
+      "友希那": 32,
+      "あこ": 33,
+      "リサ": 34,
+      "燐子": 35,
+      "燈": 36,
+      "愛音": 37,
+      "楽奈": 38,
+      "そよ": 39,
+      "立希": 40,
+      "ますき": 41,
+      "祥子": 42,
+      "睦": 43,
+      "海鈴": 44,
+      "にゃむ": 45,
+      "初華": 46,
+      "華戀": 47,
+      "晶": 48,
+      "光": 49,
+      "未知留": 50,
+      "香子": 51,
+      "雙葉": 52,
+      "真晝": 53,
+      "艾露": 54,
+      "珠緒": 55,
+      "艾露露": 56,
+      "純那": 57,
+      "克洛迪娜": 58,
+      "真矢": 59,
+      "奈奈": 60,
+      "壘": 61,
+      "文": 62,
+      "一愛": 63,
+      "菈樂菲": 64,
+      "司": 65,
+      "美空": 66,
+      "靜羽": 67,
+      "悠悠子": 68,
+      "八千代": 69,
+      "栞": 70,
+      "美帆": 71,
+      "芙蘿菈": 72,
+      "克蕾兒": 73,
+      "安德露": 74,
+      "瑪莉亞貝菈": 75,
+      "克拉迪亞": 76,
+      "桃樂西": 77,
+      "瑪麗安": 78,
+      "八重神子1": 79,
+      "娜塔莎": 80,
+      "宵宫": 81,
+      "派蒙11": 82,
+      "派蒙13": 83,
+      "派蒙3": 84,
+      "派蒙7": 85,
+      "派蒙8": 86,
+      "派蒙9": 87,
+      "派蒙10": 88,
+      "派蒙6": 89,
+      "派蒙4": 90,
+      "派蒙1": 91,
+      "派蒙2": 92,
+      "派蒙15": 93,
+      "派蒙16": 94,
+      "派蒙14": 95,
+      "派蒙12": 96,
+      "派蒙5": 97,
+      "纳西妲1": 98
+    }
+  },
+  "model": {
+    "use_spk_conditioned_encoder": true,
+    "use_noise_scaled_mas": true,
+    "use_mel_posterior_encoder": false,
+    "use_duration_discriminator": true,
+    "inter_channels": 192,
+    "hidden_channels": 192,
+    "filter_channels": 768,
+    "n_heads": 2,
+    "n_layers": 6,
+    "kernel_size": 3,
+    "p_dropout": 0.1,
+    "resblock": "1",
+    "resblock_kernel_sizes": [
+      3,
+      7,
+      11
+    ],
+    "resblock_dilation_sizes": [
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ],
+      [
+        1,
+        3,
+        5
+      ]
+    ],
+    "upsample_rates": [
+      8,
+      8,
+      2,
+      2,
+      2
+    ],
+    "upsample_initial_channel": 512,
+    "upsample_kernel_sizes": [
+      16,
+      16,
+      8,
+      2,
+      2
+    ],
+    "n_layers_q": 3,
+    "use_spectral_norm": false,
+    "gin_channels": 256
+  },
+  "version": "2.2"
+}

Data/BangDreamV22/models/G_51000.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:521be4508c8b8b81e81201372cce0ac09cef35ca0f66b3d981f1689a601db3c5
+size 750066550

app.py CHANGED Viewed

@@ -1,7 +1,8 @@
-# flake8: noqa: E402
 import os
-import logging
 import re_matching
 logging.getLogger("numba").setLevel(logging.WARNING)
@@ -15,34 +16,45 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
-import warnings
-warnings.filterwarnings("ignore", category=UserWarning, module="gradio.blocks")
-import shutil
-from datetime import datetime
-import re
-import torch
-import utils
-from infer import infer, latest_version, get_net_g
 import gradio as gr
-import numpy as np
-from tools.sentence import extrac, is_japanese, is_chinese, seconds_to_ass_time, extract_text_from_file, remove_annotations
-import sys
-import math
-from scipy.io.wavfile import write
-from tools.translate import translate
-import random
 net_g = None
-cara_list = ["ひまり","たえ","彩","日菜","美咲","ましろ","燐子","香子","珠緒","たえ"]
 BandList = {
         "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
         "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
         "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
@@ -50,172 +62,123 @@ BandList = {
         "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
         "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
         "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
-        "MyGo&AveMujica(Part)":["燈","愛音","そよ","立希","楽奈","祥子","睦","海鈴"],
         "圣翔音乐学园":["華戀","光","香子","雙葉","真晝","純那","克洛迪娜","真矢","奈奈"],
         "凛明馆女子学校":["珠緒","壘","文","悠悠子","一愛"],
         "弗隆提亚艺术学校":["艾露","艾露露","菈樂菲","司","靜羽"],
         "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
 }
-device = (
-        "cuda:0"
-        if torch.cuda.is_available()
-        else (
-            "mps"
-            if sys.platform == "darwin" and torch.backends.mps.is_available()
-            else "cpu"
-        )
-    )
-def generate_audio(
     text,
     sdp_ratio,
     noise_scale,
     noise_scale_w,
     length_scale,
-    speaker,
-    language,
 ):
-    if len(text)>100:
-        return
-    with torch.no_grad():
-        if language == 'Auto':
-            language = "EN"
-            if is_japanese(text):
-                language = "JP"
-            elif is_chinese(text):
-                language = "ZH"
-        current_time = datetime.now()
-        print(str(current_time)+':'+str(speaker)+":"+language)
-        audio = infer(
-            text,
-            sdp_ratio=sdp_ratio,
-            noise_scale=noise_scale,
-            noise_scale_w=noise_scale_w,
-            length_scale=length_scale,
-            sid=speaker,
-            language=language,
-            hps=hps,
-            net_g=net_g,
-            device=device,
-        )
-    return gr.processing_utils.convert_to_16_bit_wav(audio)
-def tts_fn(
-    text: str,
-    speaker,
-    sdp_ratio,
-    noise_scale,
-    noise_scale_w,
-    length_scale,
-    language,
-    LongSentence,
-):
-    if not LongSentence:
-        with torch.no_grad():
-            audio = generate_audio(
-                text,
                 sdp_ratio=sdp_ratio,
                 noise_scale=noise_scale,
                 noise_scale_w=noise_scale_w,
                 length_scale=length_scale,
-                speaker=speaker,
-                language= language,
-            )
             torch.cuda.empty_cache()
-        return (hps.data.sampling_rate, audio)
-    else:
-        final_list = extrac(text)
-        audio_fin = []
-        for sentence in final_list:
-            if len(sentence) > 1:
-                with torch.no_grad():
-                    audio = generate_audio(
-                        sentence,
-                        sdp_ratio=sdp_ratio,
-                        noise_scale=noise_scale,
-                        noise_scale_w=noise_scale_w,
-                        length_scale=length_scale,
-                        speaker=speaker,
-                        language= language,
-                    )
-                silence_frames = int(math.log(len(sentence)+1, 1000) * 44010) if is_chinese(sentence) else int(math.log(len(sentence)+1, 3000) * 44010)
-                silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
-                audio_fin.append(audio)
-                audio_fin.append(silence_data)
-        return (hps.data.sampling_rate, np.concatenate(audio_fin))
-def generate_audio_and_srt_for_group(group, outputPath, group_index, sampling_rate, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime):
-    audio_fin = []
-    ass_entries = []
-    start_time = 0
-    speaker = random.choice(cara_list)
-    ass_header = """[Script Info]
-; 我没意见
-Title: Audiobook
-ScriptType: v4.00+
-WrapStyle: 0
-PlayResX: 640
-PlayResY: 360
-ScaledBorderAndShadow: yes
-[V4+ Styles]
-Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
-Style: Default,Arial,20,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,0,0,0,0,100,100,0,0,1,1,1,2,10,10,10,1
-[Events]
-Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
-"""
-    for sentence in group:
-        try:
-            FakeSpeaker = sentence.split("|")[0]
-            print(FakeSpeaker)
-            SpeakersList = re.split('\n', spealerList)
-            if FakeSpeaker in list(hps.data.spk2id.keys()):
-                speaker = FakeSpeaker
-            for i in SpeakersList:
-                if FakeSpeaker == i.split("|")[1]:
-                    speaker = i.split("|")[0]
-            if sentence != '\n':
-                audio = generate_audio(remove_annotations(sentence.split("|")[-1]).replace(" ",""), speaker=speaker, sdp_ratio=sdp_ratio, noise_scale=noise_scale, noise_scale_w=noise_scale_w, length_scale=length_scale, language='Auto')
-                silence_frames = int(silenceTime * 44010)
-                silence_data = np.zeros((silence_frames,), dtype=audio.dtype)
-                audio_fin.append(audio)
-                audio_fin.append(silence_data)
-                duration = len(audio) / sampling_rate
-                end_time = start_time + duration + silenceTime
-                ass_entries.append("Dialogue: 0,{},{},".format(seconds_to_ass_time(start_time), seconds_to_ass_time(end_time)) + "Default,,0,0,0,,{}".format(sentence.replace("|","：")))
-                start_time = end_time
-        except:
-            pass
-    wav_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.wav')
-    ass_filename = os.path.join(outputPath, f'audiobook_part_{group_index}.ass')
-    write(wav_filename, sampling_rate, np.concatenate(audio_fin))
-    with open(ass_filename, 'w', encoding='utf-8') as f:
-        f.write(ass_header + '\n'.join(ass_entries))
-    return (hps.data.sampling_rate, np.concatenate(audio_fin))
-def audiobook(inputFile, groupsize, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime,filepath):
-    directory_path = filepath if torch.cuda.is_available() else "books"
-    if os.path.exists(directory_path):
-        shutil.rmtree(directory_path)
-    os.makedirs(directory_path)
-    text = extract_text_from_file(inputFile.name)
-    sentences = extrac(text)
-    GROUP_SIZE = groupsize
-    for i in range(0, len(sentences), GROUP_SIZE):
-        group = sentences[i:i+GROUP_SIZE]
-        if spealerList == "":
-            spealerList = "无"
-        result = generate_audio_and_srt_for_group(group,directory_path, i//GROUP_SIZE + 1, 44100, speaker, sdp_ratio, noise_scale, noise_scale_w, length_scale,spealerList,silenceTime)
-        if not torch.cuda.is_available():
-            return result
-    return result
 def loadmodel(model):
     _ = net_g.eval()
@@ -223,50 +186,56 @@ def loadmodel(model):
     return "success"
 if __name__ == "__main__":
-    hps = utils.get_hparams_from_file('Data/BangDream/config.json')
-    version = hps.version if hasattr(hps, "version") else latest_version
-    net_g = get_net_g(
-        model_path='Data/BangDream/models/G_33000.pth', version=version, device=device, hps=hps
-    )
-    speaker_ids = hps.data.spk2id
-    speakers = list(speaker_ids.keys())
     languages = [ "Auto", "ZH", "JP"]
     modelPaths = []
-    for dirpath, dirnames, filenames in os.walk("Data/BangDream/models/"):
         for filename in filenames:
             modelPaths.append(os.path.join(dirpath, filename))
     with gr.Blocks() as app:
-        gr.Markdown(value="""
-            少歌邦邦全员在线语音合成（Bert-Vits2）\n
-            新版本[Mygo&AveMujica](https://huggingface.co/spaces/Mahiruoshi/MyGO_VIts-bert)情感嵌入模型已更新，全员模型将于下个Bert-vits2版本更新
-            二创请标注作者：B站@Mahiroshi: https://space.bilibili.com/19874615 ,如果有问题需要反馈可私信联系\n
-            声音归属：BangDream及少歌手游\n
-            ！！！注意：huggingface容器仅用作展示，建议克隆本项目后本地运行app.py,环境参考requirements.txt\n
-            Bert-vits2[项目](https://github.com/Stardust-minus/Bert-VITS2)本身仍然处于开发过程中，因此稳定性存在一定问题。
-            关于此模型的使用参考: https://nijigaku.top/2023/10/03/BangDreamTTS\n
-            [数据集制作](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/tree/main/%E7%88%AC%E8%99%AB),
-            [服务器启动示例](https://huggingface.co/spaces/Mahiruoshi/BangDream-Bert-VITS2/blob/main/server.py)\n
-            使用本模型请严格遵守法律法规!禁止生成任何有损声优或者企划的内容!!!!!\n
-            このモデルを使用する際は法律法規を厳守してください！声優や企画に損害を与える内容の生成は禁止です!!!!!\n
-            Please strictly follow the laws in your country and regulations when using this model! It is prohibited to generate any content that is harmful to others!!!!!\n
-            发布二创作品请标注本项目作者及链接、作品使用Bert-VITS2 AI生成！\n
-            """)
         for band in BandList:
             with gr.TabItem(band):
                 for name in BandList[band]:
                     with gr.TabItem(name):
                         with gr.Row():
                             with gr.Column():
                                 with gr.Row():
                                     gr.Markdown(
                                         '<div align="center">'
-                                        f'<img style="width:auto;height:400px;" src="file/image/{name}.png">'
                                         '</div>'
                                     )
                                 length_scale = gr.Slider(
                                         minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
                                     )
-                                LongSentence = gr.Checkbox(value=False, label="自动拆分句子")
                                 with gr.Accordion(label="切换模型", open=False):
                                     modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
                                     btnMod = gr.Button("载入模型")
@@ -276,99 +245,33 @@ if __name__ == "__main__":
                                 text = gr.TextArea(
                                     label="输入纯日语或者中文",
                                     placeholder="输入纯日语或者中文",
-                                    value="有个人躺在地上，哀嚎......\n有个人睡着了，睡在盒子里。\n我要把它打开，看看他的梦是什么。",
-                                )
                                 btn = gr.Button("点击生成", variant="primary")
                                 audio_output = gr.Audio(label="Output Audio")
                                 btntran = gr.Button("快速中翻日")
                                 translateResult = gr.TextArea("从这复制翻译后的文本")
                                 btntran.click(translate, inputs=[text], outputs = [translateResult])
-                                with gr.Accordion(label="其它参数设定", open=False):
-                                    sdp_ratio = gr.Slider(
-                                    minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
-                                    )
-                                    noise_scale = gr.Slider(
-                                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
-                                    )
-                                    noise_scale_w = gr.Slider(
-                                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
-                                    )
-                                    language = gr.Dropdown(
-                                    choices=languages, value=languages[0], label="选择语言(默认自动)"
-                                    )
-                                    speaker = gr.Dropdown(
-                                        choices=speakers, value=name, label="说话人"
-                                    )
                     btn.click(
-                        tts_fn,
                         inputs=[
                             text,
-                            speaker,
                             sdp_ratio,
                             noise_scale,
                             noise_scale_w,
                             length_scale,
-                            language,
-                            LongSentence,
                         ],
                         outputs=[audio_output],
                     )
-        with gr.Tab('拓展功能'):
-            with gr.Row():
-                with gr.Column():
-                    gr.Markdown(
-                                    f"从 <a href='https://nijigaku.top/2023/10/03/BangDreamTTS/'>我的博客站点</a> 查看自制galgame使用说明\n</a>"
-                                )
-                    inputFile = gr.UploadButton(label="上传txt(可设置角色对应表)、epub或mobi文件")
-                    groupSize = gr.Slider(
-                    minimum=10, maximum=1000 if  torch.cuda.is_available() else 50,value = 50, step=1, label="单个音频文件包含的最大字数"
-                    )
-                    silenceTime = gr.Slider(
-                    minimum=0, maximum=1, value=0.5, step=0.1, label="句子的间隔"
-                    )
-                    filepath = gr.TextArea(
-                                        label="本地合成时的音频存储文件夹(会清空文件夹警告)",
-                                        value = "D:/audiobook/book1",
-                    )
-                    spealerList = gr.TextArea(
-                                        label="角色对应表(example)",
-                                        placeholder="左边是你想要在每一句话合成中用到的speaker(见角色清单)右边是你上传文本时分隔符左边设置的说话人:{ChoseSpeakerFromConfigList1}|{SeakerInUploadText1}\n{ChoseSpeakerFromConfigList2}|{SeakerInUploadText2}\n{ChoseSpeakerFromConfigList3}|{SeakerInUploadText3}\n",
-                                        value = "ましろ|真白\n七深|七深\n透子|透子\nつくし|筑紫\n瑠唯|瑠唯\nそよ|素世\n祥子|祥子",
-                    )
-                    speaker = gr.Dropdown(
-                        choices=speakers, value = "ましろ", label="选择默认说话人"
-                    )
-                with gr.Column():
-                    sdp_ratio = gr.Slider(
-                    minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
-                    )
-                    noise_scale = gr.Slider(
-                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
-                    )
-                    noise_scale_w = gr.Slider(
-                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
-                    )
-                    length_scale = gr.Slider(
-                        minimum=0.1, maximum=2, value=1, step=0.01, label="生成长度"
-                    )
-                    LastAudioOutput = gr.Audio(label="当使用cuda时才能在本地文件夹浏览全部文件")
-                    btn2 = gr.Button("点击生成", variant="primary")
-                btn2.click(
-                    audiobook,
-                    inputs=[
-                        inputFile,
-                        groupSize,
-                        speaker,
-                        sdp_ratio,
-                        noise_scale,
-                        noise_scale_w,
-                        length_scale,
-                        spealerList,
-                        silenceTime,
-                        filepath
-                    ],
-                    outputs=[LastAudioOutput],
-                )
-print("推理页面已开启!")
-app.launch()

+import argparse
 import os
+from pathlib import Path
+import logging
 import re_matching
 logging.getLogger("numba").setLevel(logging.WARNING)
 logger = logging.getLogger(__name__)
+import librosa
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.utils.data import Dataset
+from torch.utils.data import DataLoader, Dataset
+from tqdm import tqdm
+from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
 import gradio as gr
+import utils
+from config import config
+import torch
+import commons
+from text import cleaned_text_to_sequence, get_bert
+from text.cleaner import clean_text
+import utils
+from models import SynthesizerTrn
+from text.symbols import symbols
+import sys
 net_g = None
+'''
+device = (
+        "cuda:0"
+        if torch.cuda.is_available()
+        else (
+            "mps"
+            if sys.platform == "darwin" and torch.backends.mps.is_available()
+            else "cpu"
+        )
+    )
+'''
+device = "cpu"
 BandList = {
         "PoppinParty":["香澄","有咲","たえ","りみ","沙綾"],
         "Afterglow":["蘭","モカ","ひまり","巴","つぐみ"],
         "HelloHappyWorld":["こころ","美咲","薫","花音","はぐみ"],
         "Roselia":["友希那","紗夜","リサ","燐子","あこ"],
         "RaiseASuilen":["レイヤ","ロック","ますき","チュチュ","パレオ"],
         "Morfonica":["ましろ","瑠唯","つくし","七深","透子"],
+        "MyGo":["燈","愛音","そよ","立希","楽奈"],
+        "AveMujica":["祥子","睦","海鈴","にゃむ","初華"],
         "圣翔音乐学园":["華戀","光","香子","雙葉","真晝","純那","克洛迪娜","真矢","奈奈"],
         "凛明馆女子学校":["珠緒","壘","文","悠悠子","一愛"],
         "弗隆提亚艺术学校":["艾露","艾露露","菈樂菲","司","靜羽"],
         "西克菲尔特音乐学院":["晶","未知留","八千代","栞","美帆"]
 }
def get_net_g(model_path: str, device: str, hps):
    """Instantiate the synthesizer from ``hps`` and load weights from ``model_path``.

    Args:
        model_path: Checkpoint file handed to ``utils.load_checkpoint``.
        device: Device string the model is moved to (e.g. ``"cpu"``).
        hps: Hyper-parameter object; ``hps.data``, ``hps.train`` and
            ``hps.model`` are read to size and configure the network.

    Returns:
        The ``SynthesizerTrn`` instance, switched to eval mode, with the
        checkpoint loaded (no optimizer state is restored).
    """
    # Positional constructor arguments, derived from the symbol table and hps.
    n_symbols = len(symbols)
    half_filter_plus_one = hps.data.filter_length // 2 + 1
    segment_in_hops = hps.train.segment_size // hps.data.hop_length

    model = SynthesizerTrn(
        n_symbols,
        half_filter_plus_one,
        segment_in_hops,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    )
    model = model.to(device)
    model.eval()

    # Optimizer argument is None and skip_optimizer=True: weights only.
    utils.load_checkpoint(model_path, model, None, skip_optimizer=True)
    return model
def get_text(text, language_str, hps, device):
    """Turn raw text into the tensors consumed by the synthesizer.

    The text is cleaned and converted to phone/tone/language id sequences,
    optionally interspersed with blank tokens, and a BERT feature matrix is
    computed for it. The BERT features are placed in the slot that matches
    ``language_str``; the other two slots are zero-filled placeholders.

    Args:
        text: Input text to synthesize.
        language_str: One of ``"ZH"``, ``"JP"`` or ``"EN"``.
        hps: Hyper-parameter object; only ``hps.data.add_blank`` is read here.
        device: Device passed through to ``get_bert``.

    Returns:
        Tuple ``(bert, ja_bert, en_bert, phone, tone, language)`` where the
        last three are ``torch.LongTensor`` sequences.

    Raises:
        ValueError: If ``language_str`` is not ``"ZH"``, ``"JP"`` or ``"EN"``.
        AssertionError: If the BERT feature length differs from the phone
            sequence length.
    """
    # Implementation of get_text for the current model version.
    norm_text, phone, tone, word2ph = clean_text(text, language_str)
    phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str)

    if hps.data.add_blank:
        phone = commons.intersperse(phone, 0)
        tone = commons.intersperse(tone, 0)
        language = commons.intersperse(language, 0)
        # Interspersing blanks doubles the phone count per word (plus one
        # leading blank), so the word-to-phone map is scaled to match.
        for idx, count in enumerate(word2ph):
            word2ph[idx] = count * 2
        word2ph[0] += 1

    bert_ori = get_bert(norm_text, word2ph, language_str, device)
    del word2ph
    assert bert_ori.shape[-1] == len(phone), phone

    def _placeholder():
        # Zero features for the languages that are not the active one.
        return torch.zeros(1024, len(phone))

    if language_str == "ZH":
        bert, ja_bert, en_bert = bert_ori, _placeholder(), _placeholder()
    elif language_str == "JP":
        bert, ja_bert, en_bert = _placeholder(), bert_ori, _placeholder()
    elif language_str == "EN":
        # Previously "EN" fell through to the error below even though the
        # message lists it as supported.
        bert, ja_bert, en_bert = _placeholder(), _placeholder(), bert_ori
    else:
        raise ValueError("language_str should be ZH, JP or EN")

    assert bert.shape[-1] == len(
        phone
    ), f"Bert seq len {bert.shape[-1]} != {len(phone)}"

    phone = torch.LongTensor(phone)
    tone = torch.LongTensor(tone)
    language = torch.LongTensor(language)
    return bert, ja_bert, en_bert, phone, tone, language
def infer(
    text,
    sdp_ratio,
    noise_scale,
    noise_scale_w,
    length_scale,
    sid,
    reference_audio=None,
    emotion='Happy',
):
    """Synthesize ``text`` with speaker ``sid`` and return audio for Gradio.

    The language is ``'JP'`` when ``is_japanese(text)`` is true and ``'ZH'``
    otherwise. The emotion embedding comes from ``reference_audio`` when it
    is a NumPy array, else from the ``emotion`` text prompt. Relies on the
    module-level ``hps``, ``net_g`` and ``device``.

    Args:
        text: Text to synthesize.
        sdp_ratio, noise_scale, noise_scale_w, length_scale: Forwarded
            unchanged to ``net_g.infer``.
        sid: Speaker name; looked up in ``hps.data.spk2id``.
        reference_audio: Optional emotion reference.
        emotion: Text prompt used when no array reference is given.

    Returns:
        ``(hps.data.sampling_rate, audio)`` with ``audio`` converted by
        ``gr.processing_utils.convert_to_16_bit_wav``.
    """
    if is_japanese(text):
        language = 'JP'
    else:
        language = 'ZH'

    # NOTE(review): the UI widgets in this file appear to pass a path string
    # (or None) here, so the ndarray branch may never run -- confirm intent.
    if isinstance(reference_audio, np.ndarray):
        emo = get_clap_audio_feature(reference_audio, device)
    else:
        emo = get_clap_text_feature(emotion, device)
    emo = torch.squeeze(emo, dim=1)

    bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
        text, language, hps, device
    )

    def _batched(tensor):
        # Move to the inference device and add a leading batch dimension.
        return tensor.to(device).unsqueeze(0)

    with torch.no_grad():
        x_tst = _batched(phones)
        tones = _batched(tones)
        lang_ids = _batched(lang_ids)
        bert = _batched(bert)
        ja_bert = _batched(ja_bert)
        en_bert = _batched(en_bert)
        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
        emo = _batched(emo)
        del phones
        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)

        outputs = net_g.infer(
            x_tst,
            x_tst_lengths,
            speakers,
            tones,
            lang_ids,
            bert,
            ja_bert,
            en_bert,
            emo,
            sdp_ratio=sdp_ratio,
            noise_scale=noise_scale,
            noise_scale_w=noise_scale_w,
            length_scale=length_scale,
        )
        waveform = outputs[0][0, 0]
        audio = waveform.data.cpu().float().numpy()

        # Drop tensor references before asking CUDA to release cached memory.
        del outputs, waveform
        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        pcm16 = gr.processing_utils.convert_to_16_bit_wav(audio)
        return (hps.data.sampling_rate, pcm16)
def is_japanese(string):
    """Return True if ``string`` contains at least one kana character.

    A character counts as kana when its code point lies in U+3041..U+30FF,
    i.e. the Hiragana and Katakana blocks. The upper bound is inclusive so
    that U+30FF, the last Katakana code point, is recognised (the previous
    strict ``< 0x30FF`` comparison skipped it).

    Text written only in kanji contains no kana and therefore returns False.
    """
    return any(0x3041 <= ord(ch) <= 0x30FF for ch in string)
 def loadmodel(model):
     _ = net_g.eval()
     return "success"
 if __name__ == "__main__":
     languages = [ "Auto", "ZH", "JP"]
     modelPaths = []
+    for dirpath, dirnames, filenames in os.walk('Data/BangDreamV22/models/'):
         for filename in filenames:
             modelPaths.append(os.path.join(dirpath, filename))
+    hps = utils.get_hparams_from_file('Data/BangDreamV22/configs/config.json')
+    net_g = get_net_g(
+        model_path=modelPaths[-1], device=device, hps=hps
+    )
+    speaker_ids = hps.data.spk2id
+    speakers = list(speaker_ids.keys())
     with gr.Blocks() as app:
         for band in BandList:
             with gr.TabItem(band):
                 for name in BandList[band]:
                     with gr.TabItem(name):
+                        classifiedPaths = []
+                        for dirpath, dirnames, filenames in os.walk("Data/Bushiroad/classifedSample/"+name):
+                            for filename in filenames:
+                                classifiedPaths.append(os.path.join(dirpath, filename))
                         with gr.Row():
                             with gr.Column():
                                 with gr.Row():
                                     gr.Markdown(
                                         '<div align="center">'
+                                        f'<img style="width:auto;height:400px;" src="https://mahiruoshi-bangdream-bert-vits2.hf.space/file/image/{name}.png">'
                                         '</div>'
                                     )
                                 length_scale = gr.Slider(
                                         minimum=0.1, maximum=2, value=1, step=0.01, label="语速调节"
                                     )
+                                emotion = gr.Textbox(
+                                        label="Text prompt",
+                                        placeholder="用文字描述生成风格。如：Happy",
+                                        value="Happy",
+                                        visible=True,
+                                    )
+                                with gr.Accordion(label="参数设定", open=False):
+                                    sdp_ratio = gr.Slider(
+                                    minimum=0, maximum=1, value=0.2, step=0.01, label="SDP/DP混合比"
+                                    )
+                                    noise_scale = gr.Slider(
+                                        minimum=0.1, maximum=2, value=0.6, step=0.01, label="感情调节"
+                                    )
+                                    noise_scale_w = gr.Slider(
+                                        minimum=0.1, maximum=2, value=0.8, step=0.01, label="音素长度"
+                                    )
+                                    speaker = gr.Dropdown(
+                                        choices=speakers, value=name, label="说话人"
+                                    )
                                 with gr.Accordion(label="切换模型", open=False):
                                     modelstrs = gr.Dropdown(label = "模型", choices = modelPaths, value = modelPaths[0], type = "value")
                                     btnMod = gr.Button("载入模型")
                                 text = gr.TextArea(
                                     label="输入纯日语或者中文",
                                     placeholder="输入纯日语或者中文",
+                                    value="为什么要演奏春日影!",
+                                )
+                                try:
+                                    reference_audio = gr.Dropdown(label = "情感参考", choices = classifiedPaths, value = classifiedPaths[0], type = "value")
+                                except:
+                                    reference_audio = gr.Audio(label="情感参考音频）", type="filepath")
                                 btn = gr.Button("点击生成", variant="primary")
                                 audio_output = gr.Audio(label="Output Audio")
+                                '''
                                 btntran = gr.Button("快速中翻日")
                                 translateResult = gr.TextArea("从这复制翻译后的文本")
                                 btntran.click(translate, inputs=[text], outputs = [translateResult])
+                                '''
                     btn.click(
+                        infer,
                         inputs=[
                             text,
                             sdp_ratio,
                             noise_scale,
                             noise_scale_w,
                             length_scale,
+                            speaker,
+                            reference_audio,
+                            emotion,
                         ],
                         outputs=[audio_output],
                     )
+    print("推理页面已开启!")
+    app.launch(share=True)

bert/bert_models.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
-    "deberta-v2-large-japanese": {
-        "repo_id": "ku-nlp/deberta-v2-large-japanese",
         "files": ["pytorch_model.bin"]
     },
     "chinese-roberta-wwm-ext-large": {

 {
+    "deberta-v2-large-japanese-char-wwm": {
+        "repo_id": "ku-nlp/deberta-v2-large-japanese-char-wwm",
         "files": ["pytorch_model.bin"]
     },
     "chinese-roberta-wwm-ext-large": {

bert/deberta-v2-large-japanese-char-wwm/.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

bert/deberta-v2-large-japanese-char-wwm/README.md ADDED Viewed

	@@ -0,0 +1,89 @@

+---
+language: ja
+license: cc-by-sa-4.0
+library_name: transformers
+tags:
+  - deberta
+  - deberta-v2
+  - fill-mask
+  - character
+  - wwm
+datasets:
+  - wikipedia
+  - cc100
+  - oscar
+metrics:
+  - accuracy
+mask_token: "[MASK]"
+widget:
+    - text: "京都大学で自然言語処理を[MASK][MASK]する。"
+---
+# Model Card for Japanese character-level DeBERTa V2 large
+## Model description
+This is a Japanese DeBERTa V2 large model pre-trained on Japanese Wikipedia, the Japanese portion of CC-100, and the Japanese portion of OSCAR.
+This model is trained with character-level tokenization and whole word masking.
+## How to use
+You can use this model for masked language modeling as follows:
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
+model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-large-japanese-char-wwm')
+sentence = '京都大学で自然言語処理を[MASK][MASK]する。'
+encoding = tokenizer(sentence, return_tensors='pt')
+...
+```
+You can also fine-tune this model on downstream tasks.
+## Tokenization
+There is no need to tokenize texts in advance, and you can give raw texts to the tokenizer.
+The texts are tokenized into character-level tokens by [sentencepiece](https://github.com/google/sentencepiece).
+## Training data
+We used the following corpora for pre-training:
+- Japanese Wikipedia (as of 20221020, 3.2GB, 27M sentences, 1.3M documents)
+- Japanese portion of CC-100 (85GB, 619M sentences, 66M documents)
+- Japanese portion of OSCAR (54GB, 326M sentences, 25M documents)
+Note that we filtered out documents annotated with "header", "footer", or "noisy" tags in OSCAR.
+Also note that Japanese Wikipedia was duplicated 10 times to make the total size of the corpus comparable to that of CC-100 and OSCAR. As a result, the total size of the training data is 171GB.
+## Training procedure
+We first segmented texts in the corpora into words using [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) for whole word masking.
+Then, we built a sentencepiece model with 22,012 tokens including all characters that appear in the training corpus.
+We tokenized raw corpora into character-level subwords using the sentencepiece model and trained the Japanese DeBERTa model using [transformers](https://github.com/huggingface/transformers) library.
+The training took 26 days using 16 NVIDIA A100-SXM4-40GB GPUs.
+The following hyperparameters were used during pre-training:
+- learning_rate: 1e-4
+- per_device_train_batch_size: 26
+- distributed_type: multi-GPU
+- num_devices: 16
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 3,328
+- max_seq_length: 512
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-06
+- lr_scheduler_type: linear schedule with warmup (lr = 0 at 300k steps)
+- training_steps: 260,000
+- warmup_steps: 10,000
+The accuracy of the trained model on the masked language modeling task was 0.795.
+The evaluation set consists of 5,000 randomly sampled documents from each of the training corpora.
+## Acknowledgments
+This work was supported by Joint Usage/Research Center for Interdisciplinary Large-scale Information Infrastructures (JHPCN) through General Collaboration Project no. jh221004, "Developing a Platform for Constructing and Sharing of Large-Scale Japanese Language Models".
+For training models, we used the mdx: a platform for the data-driven future.

bert/deberta-v2-large-japanese-char-wwm/config.json ADDED Viewed

	@@ -0,0 +1,37 @@

+{
+  "architectures": [
+    "DebertaV2ForMaskedLM"
+  ],
+  "attention_head_size": 64,
+  "attention_probs_dropout_prob": 0.1,
+  "conv_act": "gelu",
+  "conv_kernel_size": 3,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-07,
+  "max_position_embeddings": 512,
+  "max_relative_positions": -1,
+  "model_type": "deberta-v2",
+  "norm_rel_ebd": "layer_norm",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 0,
+  "pooler_dropout": 0,
+  "pooler_hidden_act": "gelu",
+  "pooler_hidden_size": 1024,
+  "pos_att_type": [
+    "p2c",
+    "c2p"
+  ],
+  "position_biased_input": false,
+  "position_buckets": 256,
+  "relative_attention": true,
+  "share_att_key": true,
+  "torch_dtype": "float16",
+  "transformers_version": "4.25.1",
+  "type_vocab_size": 0,
+  "vocab_size": 22012
+}

bert/deberta-v2-large-japanese-char-wwm/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:bf0dab8ad87bd7c22e85ec71e04f2240804fda6d33196157d6b5923af6ea1201
+size 1318456639

bert/deberta-v2-large-japanese-char-wwm/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "[CLS]",
+  "mask_token": "[MASK]",
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "unk_token": "[UNK]"
+}

bert/deberta-v2-large-japanese-char-wwm/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "do_subword_tokenize": true,
+  "do_word_tokenize": true,
+  "jumanpp_kwargs": null,
+  "mask_token": "[MASK]",
+  "mecab_kwargs": null,
+  "model_max_length": 1000000000000000019884624838656,
+  "never_split": null,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "special_tokens_map_file": null,
+  "subword_tokenizer_type": "character",
+  "sudachi_kwargs": null,
+  "tokenizer_class": "BertJapaneseTokenizer",
+  "unk_token": "[UNK]",
+  "word_tokenizer_type": "basic"
+}

bert/deberta-v2-large-japanese-char-wwm/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

bert_gen.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import torch
-from multiprocessing import Pool
 import commons
 import utils
-from tqdm import tqdm
-from text import check_bert_models, cleaned_text_to_sequence, get_bert
-import argparse
-import torch.multiprocessing as mp
 from config import config
 def process_line(line):
@@ -57,7 +59,6 @@ if __name__ == "__main__":
     args, _ = parser.parse_known_args()
     config_path = args.config
     hps = utils.get_hparams_from_file(config_path)
-    check_bert_models()
     lines = []
     with open(hps.data.training_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
@@ -65,7 +66,7 @@ if __name__ == "__main__":
     with open(hps.data.validation_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
     if len(lines) != 0:
-        num_processes = args.num_processes
         with Pool(processes=num_processes) as pool:
             for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                 pass

+import argparse
+from multiprocessing import Pool, cpu_count
 import torch
+import torch.multiprocessing as mp
+from tqdm import tqdm
 import commons
 import utils
 from config import config
+from text import cleaned_text_to_sequence, get_bert
 def process_line(line):
     args, _ = parser.parse_known_args()
     config_path = args.config
     hps = utils.get_hparams_from_file(config_path)
     lines = []
     with open(hps.data.training_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
     with open(hps.data.validation_files, encoding="utf-8") as f:
         lines.extend(f.readlines())
     if len(lines) != 0:
+        num_processes = min(args.num_processes, cpu_count())
         with Pool(processes=num_processes) as pool:
             for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                 pass

clap_gen.py ADDED Viewed

	@@ -0,0 +1,64 @@

+import argparse
+from multiprocessing import Pool, cpu_count
+import torch
+import torch.multiprocessing as mp
+from tqdm import tqdm
+import utils
+from config import config
+from clap_wrapper import get_clap_audio_feature
+import librosa
+import os
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["MKL_NUM_THREADS"] = "1"
def process_line(line):
    """Compute and store the CLAP audio embedding for one filelist entry.

    ``line`` is one "|"-separated filelist record with exactly seven fields;
    only the first (the wav path) is used here. The embedding is written next
    to the audio file, with ".wav"/".WAV" in the path replaced by ".emo.npy".
    Entries whose target file already exists are skipped.
    """
    device = config.emo_gen_config.device
    if config.emo_gen_config.use_multi_device:
        # Spread pool workers over the visible GPUs using the worker's
        # process identity; the parent process has an empty identity tuple.
        rank = mp.current_process()._identity
        rank = rank[0] if rank else 0
        if torch.cuda.is_available():
            gpu_id = rank % torch.cuda.device_count()
            device = torch.device(f"cuda:{gpu_id}")
        else:
            device = torch.device("cpu")
    # Unpack all seven fields so a malformed record still raises ValueError.
    wav_path, _spk, _lang, _text, _phones, _tone, _word2ph = line.strip().split("|")
    clap_path = wav_path.replace(".WAV", ".wav").replace(".wav", ".emo.npy")
    if os.path.isfile(clap_path):
        return
    # `sr` must be passed by keyword: newer librosa releases reject it as a
    # positional argument.
    audio = librosa.load(wav_path, sr=48000)[0]
    clap = get_clap_audio_feature(audio, device)
    # NOTE: written with torch.save even though the suffix is ".npy".
    torch.save(clap, clap_path)
if __name__ == "__main__":
    # Command-line defaults come from the emo_gen section of the project config.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--config", type=str, default=config.emo_gen_config.config_path
    )
    parser.add_argument(
        "--num_processes", type=int, default=config.emo_gen_config.num_processes
    )
    # parse_known_args: options this script does not define are ignored.
    args, _ = parser.parse_known_args()
    config_path = args.config
    hps = utils.get_hparams_from_file(config_path)

    # Every entry of both the training and the validation filelist is processed.
    lines = []
    with open(hps.data.training_files, encoding="utf-8") as f:
        lines.extend(f.readlines())
    with open(hps.data.validation_files, encoding="utf-8") as f:
        lines.extend(f.readlines())

    if lines:
        # Never start more workers than there are CPU cores.
        num_processes = min(args.num_processes, cpu_count())
        with Pool(processes=num_processes) as pool:
            for _ in tqdm(pool.imap_unordered(process_line, lines), total=len(lines)):
                pass
    # process_line writes "*.emo.npy" files, so the message names that suffix.
    print(f"clap生成完毕!, 共有{len(lines)}个emo.npy生成!")

clap_wrapper.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import sys
+import torch
+from transformers import ClapModel, ClapProcessor
+from config import config
+models = dict()
+processor = ClapProcessor.from_pretrained("./emotional/clap-htsat-fused")
def get_clap_audio_feature(audio_data, device=config.bert_gen_config.device):
    """Embed audio with the local CLAP checkpoint and return the transposed result.

    A "cpu" request is redirected to "mps" on macOS when that backend is
    available, and a falsy ``device`` falls back to "cuda". One model instance
    per device is kept in the module-level ``models`` dict. ``audio_data`` is
    handed to the processor with ``sampling_rate=48000``.
    """
    on_mac_with_mps = sys.platform == "darwin" and torch.backends.mps.is_available()
    if on_mac_with_mps and device == "cpu":
        device = "mps"
    device = device or "cuda"
    if device not in models:
        # Load lazily so the checkpoint is only read for devices actually used.
        clap = ClapModel.from_pretrained("./emotional/clap-htsat-fused")
        models[device] = clap.to(device)
    with torch.no_grad():
        batch = processor(audios=audio_data, return_tensors="pt", sampling_rate=48000)
        batch = batch.to(device)
        emb = models[device].get_audio_features(**batch)
    return emb.T
def get_clap_text_feature(text, device=config.bert_gen_config.device):
    """Embed text with the local CLAP checkpoint and return the transposed result.

    A "cpu" request is redirected to "mps" on macOS when that backend is
    available, and a falsy ``device`` falls back to "cuda". One model instance
    per device is kept in the module-level ``models`` dict.
    """
    on_mac_with_mps = sys.platform == "darwin" and torch.backends.mps.is_available()
    if on_mac_with_mps and device == "cpu":
        device = "mps"
    device = device or "cuda"
    if device not in models:
        # Load lazily so the checkpoint is only read for devices actually used.
        clap = ClapModel.from_pretrained("./emotional/clap-htsat-fused")
        models[device] = clap.to(device)
    with torch.no_grad():
        batch = processor(text=text, return_tensors="pt")
        batch = batch.to(device)
        emb = models[device].get_text_features(**batch)
    return emb.T

commons.py CHANGED Viewed

@@ -46,26 +46,18 @@ def rand_gumbel_like(x):
 def slice_segments(x, ids_str, segment_size=4):
-    ret = torch.zeros_like(x[:, :, :segment_size])
-    for i in range(x.size(0)):
-        idx_str = ids_str[i]
-        idx_end = idx_str + segment_size
-        if idx_str < 0:
-            i1 = x.size(2) + idx_str
-            r1 = x[i, :, i1:]
-            r2 = x[i, :, :idx_end]
-            ret[i] = torch.cat([r1, r2], dim=1)
-        else:
-            ret[i] = x[i, :, idx_str:idx_end]
-    return ret
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
     b, d, t = x.size()
     if x_lengths is None:
         x_lengths = t
-    ids_str_max = x_lengths - segment_size + 1
-    ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
     ret = slice_segments(x, ids_str, segment_size)
     return ret, ids_str

 def slice_segments(x, ids_str, segment_size=4):
     """Cut a fixed-length window out of each batch item along the last axis.
     For batch item ``i`` the result equals
     ``x[i, :, ids_str[i] : ids_str[i] + segment_size]``. ``x`` is indexed as a
     3-D tensor and ``ids_str`` must hold one start index per batch item.
     Start indices must keep the whole window inside the last axis.
     """
     # Build the (batch, 1, segment_size) window positions once, then view them
     # across the channel axis with expand() instead of copying per channel.
     offsets = torch.arange(segment_size, device=x.device)
     gather_indices = ids_str.view(x.size(0), 1, 1) + offsets
     gather_indices = gather_indices.expand(-1, x.size(1), -1)
     return torch.gather(x, 2, gather_indices)
 def rand_slice_segments(x, x_lengths=None, segment_size=4):
     """Slice one randomly positioned window of ``segment_size`` from each item.
     ``x`` is unpacked as (batch, channels, time). ``x_lengths`` optionally gives
     the usable length per item; when omitted the full time axis is used.
     Returns ``(segments, ids_str)`` where ``ids_str`` holds the chosen starts.
     """
     b, _, t = x.size()
     if x_lengths is None:
         x_lengths = t
     # torch.clamp only accepts tensors, so wrap the value first: with
     # x_lengths=None it is a plain Python int at this point.
     ids_str_max = torch.clamp(
         torch.as_tensor(x_lengths - segment_size + 1, device=x.device), min=0
     )
     ids_str = (torch.rand([b], device=x.device) * ids_str_max).to(dtype=torch.long)
     ret = slice_segments(x, ids_str, segment_size)
     return ret, ids_str

compress_model.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from collections import OrderedDict
+from text.symbols import symbols
+import torch
+from tools.log import logger
+import utils
+from models import SynthesizerTrn
+import os
def copyStateDict(state_dict):
    """Return an ordered copy of ``state_dict`` without a leading "module" part.

    If the first key starts with "module", the first dot-separated component
    is removed from every key; otherwise keys are copied unchanged. Values are
    not copied. An empty mapping yields an empty ``OrderedDict``.
    """
    if not state_dict:
        return OrderedDict()
    # The first key decides for the whole mapping whether a prefix is stripped.
    start_idx = 1 if next(iter(state_dict)).startswith("module") else 0
    new_state_dict = OrderedDict()
    for k, v in state_dict.items():
        # Re-join with "." so dotted parameter names keep their original form.
        name = ".".join(k.split(".")[start_idx:])
        new_state_dict[name] = v
    return new_state_dict
def removeOptimizer(config: str, input_model: str, ishalf: bool, output_model: str):
    """Write a slimmed-down copy of a generator checkpoint to ``output_model``.

    The checkpoint at ``input_model`` is loaded on the CPU, every weight whose
    key contains "enc_q" is dropped, and the remaining weights are optionally
    converted to half precision. The saved file carries the state of a freshly
    constructed AdamW optimizer, iteration 0 and a learning rate of 0.0001.
    """
    hps = utils.get_hparams_from_file(config)

    # A model is instantiated only so that a fresh optimizer can be built on it.
    spec_channels = hps.data.filter_length // 2 + 1
    segment_frames = hps.train.segment_size // hps.data.hop_length
    net_g = SynthesizerTrn(
        len(symbols),
        spec_channels,
        segment_frames,
        n_speakers=hps.data.n_speakers,
        **hps.model,
    )
    optim_g = torch.optim.AdamW(
        net_g.parameters(),
        hps.train.learning_rate,
        betas=hps.train.betas,
        eps=hps.train.eps,
    )

    checkpoint = copyStateDict(torch.load(input_model, map_location="cpu"))
    weights = checkpoint["model"]
    kept = [key for key in weights if "enc_q" not in key]
    if ishalf:
        slim_weights = {key: weights[key].half() for key in kept}
    else:
        slim_weights = {key: weights[key] for key in kept}

    release = {
        "model": slim_weights,
        "iteration": 0,
        "optimizer": optim_g.state_dict(),
        "learning_rate": 0.0001,
    }
    torch.save(release, output_model)
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-c", "--config", type=str, default="configs/config.json")
    # The input checkpoint is mandatory: without it there is nothing to load
    # and no name to derive the default output path from.
    parser.add_argument("-i", "--input", type=str, required=True)
    parser.add_argument("-o", "--output", type=str, default=None)
    parser.add_argument(
        "-hf", "--half", action="store_true", default=False, help="Save as FP16"
    )
    args = parser.parse_args()

    output = args.output
    if output is None:
        # Default name: <input stem>_release[_half]<input extension>.
        filename, ext = os.path.splitext(args.input)
        half = "_half" if args.half else ""
        output = filename + "_release" + half + ext

    removeOptimizer(args.config, args.input, args.half, output)
    logger.info(f"压缩模型成功, 输出模型: {os.path.abspath(output)}")

config.py CHANGED Viewed

@@ -38,7 +38,7 @@ class Preprocess_text_config:
         train_path: str,
         val_path: str,
         config_path: str,
-        val_per_spk: int = 5,
         max_val_total: int = 10000,
         clean: bool = True,
     ):
@@ -47,7 +47,7 @@ class Preprocess_text_config:
         self.train_path: str = train_path  # 训练集路径，可以不填。不填则将在原始文本目录生成
         self.val_path: str = val_path  # 验证集路径，可以不填。不填则将在原始文本目录生成
         self.config_path: str = config_path  # 配置文件路径
-        self.val_per_spk: int = val_per_spk  # 每个speaker的验证集条数
         self.max_val_total: int = max_val_total  # 验证集最大条数，多于的会被截断并放到训练集中
         self.clean: bool = clean  # 是否进行数据清洗
@@ -99,10 +99,12 @@ class Emo_gen_config:
         config_path: str,
         num_processes: int = 2,
         device: str = "cuda",
     ):
         self.config_path = config_path
         self.num_processes = num_processes
         self.device = device
     @classmethod
     def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -120,11 +122,17 @@ class Train_ms_config:
         env: Dict[str, any],
         base: Dict[str, any],
         model: str,
     ):
         self.env = env  # 需要加载的环境变量
         self.base = base  # 底模配置
         self.model = model  # 训练模型存储目录，该路径为相对于dataset_path的路径，而非项目根目录
         self.config_path = config_path  # 配置文件路径
     @classmethod
     def from_dict(cls, dataset_path: str, data: Dict[str, any]):
@@ -216,6 +224,9 @@ class Config:
             self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
                 dataset_path, yaml_config["bert_gen"]
             )
             self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
                 dataset_path, yaml_config["train_ms"]
             )

         train_path: str,
         val_path: str,
         config_path: str,
+        val_per_lang: int = 5,
         max_val_total: int = 10000,
         clean: bool = True,
     ):
         self.train_path: str = train_path  # 训练集路径，可以不填。不填则将在原始文本目录生成
         self.val_path: str = val_path  # 验证集路径，可以不填。不填则将在原始文本目录生成
         self.config_path: str = config_path  # 配置文件路径
+        self.val_per_lang: int = val_per_lang  # 每个语言的验证集条数
         self.max_val_total: int = max_val_total  # 验证集最大条数，多于的会被截断并放到训练集中
         self.clean: bool = clean  # 是否进行数据清洗
         config_path: str,
         num_processes: int = 2,
         device: str = "cuda",
+        use_multi_device: bool = False,
     ):
         self.config_path = config_path
         self.num_processes = num_processes
         self.device = device
+        self.use_multi_device = use_multi_device
     @classmethod
     def from_dict(cls, dataset_path: str, data: Dict[str, any]):
         env: Dict[str, any],
         base: Dict[str, any],
         model: str,
+        num_workers: int,
+        spec_cache: bool,
+        keep_ckpts: int,
     ):
         self.env = env  # 需要加载的环境变量
         self.base = base  # 底模配置
         self.model = model  # 训练模型存储目录，该路径为相对于dataset_path的路径，而非项目根目录
         self.config_path = config_path  # 配置文件路径
+        self.num_workers = num_workers  # worker数量
+        self.spec_cache = spec_cache  # 是否启用spec缓存
+        self.keep_ckpts = keep_ckpts  # ckpt数量
     @classmethod
     def from_dict(cls, dataset_path: str, data: Dict[str, any]):
             self.bert_gen_config: Bert_gen_config = Bert_gen_config.from_dict(
                 dataset_path, yaml_config["bert_gen"]
             )
+            self.emo_gen_config: Emo_gen_config = Emo_gen_config.from_dict(
+                dataset_path, yaml_config["emo_gen"]
+            )
             self.train_ms_config: Train_ms_config = Train_ms_config.from_dict(
                 dataset_path, yaml_config["train_ms"]
             )

config.yml CHANGED Viewed

@@ -4,7 +4,7 @@
 # 拟提供通用路径配置，统一存放数据，避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下，后续所有的路径配置均为相对于datasetPath的路径
 # 不填或者填空则路径为相对于项目根目录的路径
-dataset_path: ""
 # 模型镜像源，默认huggingface，使用openi镜像源需指定openi_token
 mirror: ""
@@ -17,16 +17,16 @@ resample:
   sampling_rate: 44100
   # 音频文件输入路径，重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
-  in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
-  out_dir: ""
 # preprocess_text 数据集预处理相关配置
 # 注意， “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径，文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
-  transcription_path: "filelists/bushroid.list"
   # 数据清洗后文本路径，可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
@@ -35,10 +35,10 @@ preprocess_text:
   val_path: "filelists/val.list"
   # 配置文件路径
   config_path: "config.json"
-  # 每个speaker的验证集条数
-  val_per_spk: 4
   # 验证集最大条数，多于的会被截断并放到训练集中
-  max_val_total: 8
   # 是否进行数据清洗
   clean: true
@@ -49,35 +49,51 @@ bert_gen:
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
-  num_processes: 2
   # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
   # 使用多卡推理
   use_multi_device: false
 # train 训练配置
 # 注意， “:” 后需要加空格
 train_ms:
-  # 需要加载的环境变量，多显卡训练时RANK请手动在环境变量填写
-  # 环境变量对应名称环境变量不存在时加载，也就是说手动添加的环境变量优先级更高，会覆盖本配置文件
   env:
     MASTER_ADDR: "localhost"
     MASTER_PORT: 10086
     WORLD_SIZE: 1
     RANK: 0
     # 可以填写任意名的环境变量
     # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
-    use_base_model: True
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2中日英底模-fix" # openi网页的模型名
   # 训练模型存储目录：与旧版本的区别，原先数据集是存放在logs/model_name下的，现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
-  config_path: "configs/config.json"
 # webui webui配置
@@ -86,9 +102,9 @@ webui:
   # 推理设备
   device: "cuda"
   # 模型路径
-  model: "genshin/models/G_8000.pth"
   # 配置文件路径
-  config_path: "configs/config.json"
   # 端口号
   port: 7860
   # 是否公开部署，对外网开放
@@ -99,7 +115,7 @@ webui:
   language_identification_library: "langid"
-# server api配置
 # 注意， “:” 后需要加空格
 # 注意，本配置下的所有配置均为相对于根目录的路径
 server:
@@ -107,8 +123,10 @@ server:
   port: 5000
   # 模型默认使用设备：但是当前并没有实现这个配置。
   device: "cuda"
-  # 需要加载的所有模型的配置
   # 注意，所有模型都必须正确配置model与config的路径，空路径会导致加载错误。
   models:
     - # 模型的路径
       model: ""
@@ -149,7 +167,6 @@ server:
       # 不必填写所有人物，不填的使用默认值
       speakers: [ ] # 也可以不填
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key

 # 拟提供通用路径配置，统一存放数据，避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下，后续所有的路径配置均为相对于datasetPath的路径
 # 不填或者填空则路径为相对于项目根目录的路径
+dataset_path: "Data/"
 # 模型镜像源，默认huggingface，使用openi镜像源需指定openi_token
 mirror: ""
   sampling_rate: 44100
   # 音频文件输入路径，重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
+  in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
+  out_dir: "audios/wavs"
 # preprocess_text 数据集预处理相关配置
 # 注意， “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径，文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
+  transcription_path: "filelists/你的数据集文本.list"
   # 数据清洗后文本路径，可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
   val_path: "filelists/val.list"
   # 配置文件路径
   config_path: "config.json"
+  # 每个语言的验证集条数
+  val_per_lang: 4
   # 验证集最大条数，多于的会被截断并放到训练集中
+  max_val_total: 12
   # 是否进行数据清洗
   clean: true
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
+  num_processes: 4
   # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
   # 使用多卡推理
   use_multi_device: false
+# emo_gen 相关配置
+# 注意， “:” 后需要加空格
+emo_gen:
+  # 训练数据集配置文件路径
+  config_path: "config.json"
+  # 并行数
+  num_processes: 4
+  # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
+  device: "cuda"
+  # 使用多卡推理
+  use_multi_device: false
 # train 训练配置
 # 注意， “:” 后需要加空格
 train_ms:
   env:
     MASTER_ADDR: "localhost"
     MASTER_PORT: 10086
     WORLD_SIZE: 1
+    LOCAL_RANK: 0
     RANK: 0
     # 可以填写任意名的环境变量
     # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
+    use_base_model: false
     repo_id: "Stardust_minus/Bert-VITS2"
+    model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
   # 训练模型存储目录：与旧版本的区别，原先数据集是存放在logs/model_name下的，现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
+  config_path: "config.json"
+  # 训练使用的worker，不建议超过CPU核心数
+  num_workers: 16
+  # 关闭此项可以节约接近50%的磁盘空间，但是可能导致实际训练速度变慢和更高的CPU使用率。
+  spec_cache: True
+  # 保存的检查点数量，多于此数目的权重会被删除来节省空间。
+  keep_ckpts: 8
 # webui webui配置
   # 推理设备
   device: "cuda"
   # 模型路径
+  model: "models/G_8000.pth"
   # 配置文件路径
+  config_path: "config.json"
   # 端口号
   port: 7860
   # 是否公开部署，对外网开放
   language_identification_library: "langid"
+# server-fastapi配置
 # 注意， “:” 后需要加空格
 # 注意，本配置下的所有配置均为相对于根目录的路径
 server:
   port: 5000
   # 模型默认使用设备：但是当前并没有实现这个配置。
   device: "cuda"
+  # 需要加载的所有模型的配置，可以填多个模型，也可以不填模型，等网页成功后手动加载模型
+  # 不加载模型的配置格式：删除默认给的两个模型配置，给models赋值 [ ]，也就是空列表。参考模型2的speakers 即 models: [ ]
   # 注意，所有模型都必须正确配置model与config的路径，空路径会导致加载错误。
+  # 也可以不填模型，等网页加载成功后手动填写models。
   models:
     - # 模型的路径
       model: ""
       # 不必填写所有人物，不填的使用默认值
       speakers: [ ] # 也可以不填
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key

configs/config.json CHANGED Viewed

@@ -10,7 +10,7 @@
       0.99
     ],
     "eps": 1e-09,
-    "batch_size": 16,
     "fp16_run": false,
     "lr_decay": 0.99995,
     "segment_size": 16384,
@@ -18,11 +18,14 @@
     "warmup_epochs": 0,
     "c_mel": 45,
     "c_kl": 1.0,
-    "skip_optimizer": true
   },
   "data": {
-    "training_files": "Data/BangDream/filelists/train.list",
-    "validation_files": "Data/BangDream/filelists/val.list",
     "max_wav_value": 32768.0,
     "sampling_rate": 44100,
     "filter_length": 2048,
@@ -32,101 +35,864 @@
     "mel_fmin": 0.0,
     "mel_fmax": null,
     "add_blank": true,
-    "n_speakers": 700,
     "cleaned_text": true,
     "spk2id": {
-      "華戀": 0,
-      "晶": 1,
-      "光": 2,
-      "未知留": 3,
-      "香子": 4,
-      "雙葉": 5,
-      "真晝": 6,
-      "艾露": 7,
-      "珠緒": 8,
-      "艾露露": 9,
-      "純那": 10,
-      "克洛迪娜": 11,
-      "真矢": 12,
-      "奈奈": 13,
-      "壘": 14,
-      "文": 15,
-      "一愛": 16,
-      "菈樂菲": 17,
-      "司": 18,
-      "美空": 19,
-      "靜羽": 20,
-      "悠悠子": 21,
-      "八千代": 22,
-      "栞": 23,
-      "美帆": 24,
-      "芙蘿菈": 25,
-      "克蕾兒": 26,
-      "安德露": 27,
-      "瑪莉亞貝菈": 28,
-      "克拉迪亞": 29,
-      "桃樂西": 30,
-      "瑪麗安": 31,
-      "三月七": 32,
-      "香澄": 33,
-      "有咲": 34,
-      "沙綾": 35,
-      "りみ": 36,
-      "たえ": 37,
-      "沙綾、りみ、たえ": 38,
-      "巴": 39,
-      "一同": 40,
-      "まりな": 41,
-      "ゆり": 42,
-      "明日香": 43,
-      "？？？": 44,
-      "ひまり": 45,
-      "モカ": 46,
-      "つぐみ": 47,
-      "蘭": 48,
-      "リサ": 49,
-      "千聖": 50,
-      "花音": 51,
-      "イヴ": 52,
-      "日菜": 53,
-      "友希那": 54,
-      "紗夜": 55,
-      "こころ": 56,
-      "美咲": 57,
-      "薫": 58,
-      "はぐみ": 59,
-      "ミッシェル": 60,
-      "マリー": 61,
-      "怪盗ハロハッピー": 62,
-      "ニコリーナ": 63,
-      "彩": 64,
-      "麻弥": 65,
-      "燐子": 66,
-      "あこ": 67,
-      "ゆきな": 68,
-      "ましろ": 69,
-      "つくし": 70,
-      "透子": 71,
-      "七深": 72,
-      "瑠唯": 73,
-      "六花": 74,
-      "パレオ": 75,
-      "レイヤ": 76,
-      "マスキング": 77,
-      "チュチュ": 78,
-      "ますき": 79,
-      "ロック": 80,
-      "令王那": 81,
-      "CHIYU": 82,
-      "レイ": 83,
-      "燈": 84,
-      "そよ": 85,
-      "祥子": 86,
-      "立希": 87,
-      "睦": 88,
-      "愛音": 89,
-      "楽奈": 90,
-      "海鈴": 91
     }
   },
   "model": {
@@ -183,5 +949,5 @@
     "use_spectral_norm": false,
     "gin_channels": 256
   },
-  "version": "2.0"
-}

       0.99
     ],
     "eps": 1e-09,
+    "batch_size": 12,
     "fp16_run": false,
     "lr_decay": 0.99995,
     "segment_size": 16384,
     "warmup_epochs": 0,
     "c_mel": 45,
     "c_kl": 1.0,
+    "skip_optimizer": true,
+    "freeze_ZH_bert": false,
+    "freeze_JP_bert": false,
+    "freeze_EN_bert": false
   },
   "data": {
+    "training_files": "filelists/train.list",
+    "validation_files": "filelists/val.list",
     "max_wav_value": 32768.0,
     "sampling_rate": 44100,
     "filter_length": 2048,
     "mel_fmin": 0.0,
     "mel_fmax": null,
     "add_blank": true,
+    "n_speakers": 896,
     "cleaned_text": true,
     "spk2id": {
+      "派蒙_ZH": 0,
+      "纳西妲_ZH": 1,
+      "凯亚_ZH": 2,
+      "阿贝多_ZH": 3,
+      "温迪_ZH": 4,
+      "枫原万叶_ZH": 5,
+      "钟离_ZH": 6,
+      "荒泷一斗_ZH": 7,
+      "八重神子_ZH": 8,
+      "艾尔海森_ZH": 9,
+      "提纳里_ZH": 10,
+      "迪希雅_ZH": 11,
+      "卡维_ZH": 12,
+      "宵宫_ZH": 13,
+      "那维莱特_ZH": 14,
+      "莱依拉_ZH": 15,
+      "赛诺_ZH": 16,
+      "莫娜_ZH": 17,
+      "诺艾尔_ZH": 18,
+      "托马_ZH": 19,
+      "凝光_ZH": 20,
+      "林尼_ZH": 21,
+      "北斗_ZH": 22,
+      "柯莱_ZH": 23,
+      "神里绫华_ZH": 24,
+      "可莉_ZH": 25,
+      "芭芭拉_ZH": 26,
+      "雷电将军_ZH": 27,
+      "娜维娅_ZH": 28,
+      "芙宁娜_ZH": 29,
+      "珊瑚宫心海_ZH": 30,
+      "鹿野院平藏_ZH": 31,
+      "迪奥娜_ZH": 32,
+      "琴_ZH": 33,
+      "五郎_ZH": 34,
+      "班尼特_ZH": 35,
+      "达达利亚_ZH": 36,
+      "安柏_ZH": 37,
+      "莱欧斯利_ZH": 38,
+      "夜兰_ZH": 39,
+      "妮露_ZH": 40,
+      "辛焱_ZH": 41,
+      "丽莎_ZH": 42,
+      "珐露珊_ZH": 43,
+      "魈_ZH": 44,
+      "香菱_ZH": 45,
+      "迪卢克_ZH": 46,
+      "砂糖_ZH": 47,
+      "烟绯_ZH": 48,
+      "早柚_ZH": 49,
+      "云堇_ZH": 50,
+      "刻晴_ZH": 51,
+      "重云_ZH": 52,
+      "优菈_ZH": 53,
+      "胡桃_ZH": 54,
+      "流浪者_ZH": 55,
+      "久岐忍_ZH": 56,
+      "神里绫人_ZH": 57,
+      "甘雨_ZH": 58,
+      "戴因斯雷布_ZH": 59,
+      "菲谢尔_ZH": 60,
+      "白术_ZH": 61,
+      "行秋_ZH": 62,
+      "九条裟罗_ZH": 63,
+      "夏洛蒂_ZH": 64,
+      "雷泽_ZH": 65,
+      "申鹤_ZH": 66,
+      "荧_ZH": 67,
+      "空_ZH": 68,
+      "迪娜泽黛_ZH": 69,
+      "凯瑟琳_ZH": 70,
+      "多莉_ZH": 71,
+      "坎蒂丝_ZH": 72,
+      "琳妮特_ZH": 73,
+      "萍姥姥_ZH": 74,
+      "罗莎莉亚_ZH": 75,
+      "埃德_ZH": 76,
+      "爱贝尔_ZH": 77,
+      "伊迪娅_ZH": 78,
+      "留云借风真君_ZH": 79,
+      "绮良良_ZH": 80,
+      "七七_ZH": 81,
+      "式大将_ZH": 82,
+      "瑶瑶_ZH": 83,
+      "奥兹_ZH": 84,
+      "菲米尼_ZH": 85,
+      "米卡_ZH": 86,
+      "哲平_ZH": 87,
+      "大肉丸_ZH": 88,
+      "托克_ZH": 89,
+      "蒂玛乌斯_ZH": 90,
+      "昆钧_ZH": 91,
+      "欧菲妮_ZH": 92,
+      "塞琉斯_ZH": 93,
+      "仆人_ZH": 94,
+      "迈勒斯_ZH": 95,
+      "希格雯_ZH": 96,
+      "阿守_ZH": 97,
+      "拉赫曼_ZH": 98,
+      "杜拉夫_ZH": 99,
+      "伊利亚斯_ZH": 100,
+      "阿晃_ZH": 101,
+      "旁白_ZH": 102,
+      "爱德琳_ZH": 103,
+      "埃洛伊_ZH": 104,
+      "德沃沙克_ZH": 105,
+      "玛乔丽_ZH": 106,
+      "塞塔蕾_ZH": 107,
+      "柊千里_ZH": 108,
+      "海芭夏_ZH": 109,
+      "九条镰治_ZH": 110,
+      "阿娜耶_ZH": 111,
+      "笼钓瓶一心_ZH": 112,
+      "回声海螺_ZH": 113,
+      "劳维克_ZH": 114,
+      "元太_ZH": 115,
+      "阿扎尔_ZH": 116,
+      "查尔斯_ZH": 117,
+      "阿洛瓦_ZH": 118,
+      "埃勒曼_ZH": 119,
+      "纳比尔_ZH": 120,
+      "莎拉_ZH": 121,
+      "康纳_ZH": 122,
+      "博来_ZH": 123,
+      "玛塞勒_ZH": 124,
+      "阿祇_ZH": 125,
+      "博士_ZH": 126,
+      "玛格丽特_ZH": 127,
+      "迪尔菲_ZH": 128,
+      "宛烟_ZH": 129,
+      "羽生田千鹤_ZH": 130,
+      "海妮耶_ZH": 131,
+      "旅行者_ZH": 132,
+      "霍夫曼_ZH": 133,
+      "佐西摩斯_ZH": 134,
+      "鹿野奈奈_ZH": 135,
+      "舒伯特_ZH": 136,
+      "天叔_ZH": 137,
+      "艾莉丝_ZH": 138,
+      "龙二_ZH": 139,
+      "莺儿_ZH": 140,
+      "嘉良_ZH": 141,
+      "一心传名刀_ZH": 142,
+      "费迪南德_ZH": 143,
+      "珊瑚_ZH": 144,
+      "言笑_ZH": 145,
+      "久利须_ZH": 146,
+      "嘉玛_ZH": 147,
+      "艾文_ZH": 148,
+      "克洛琳德_ZH": 149,
+      "丹吉尔_ZH": 150,
+      "女士_ZH": 151,
+      "白老先生_ZH": 152,
+      "天目十五_ZH": 153,
+      "老孟_ZH": 154,
+      "巴达维_ZH": 155,
+      "长生_ZH": 156,
+      "吴船长_ZH": 157,
+      "拉齐_ZH": 158,
+      "艾伯特_ZH": 159,
+      "松浦_ZH": 160,
+      "埃泽_ZH": 161,
+      "阿圆_ZH": 162,
+      "莫塞伊思_ZH": 163,
+      "阿拉夫_ZH": 164,
+      "杜吉耶_ZH": 165,
+      "石头_ZH": 166,
+      "百闻_ZH": 167,
+      "波洛_ZH": 168,
+      "斯坦利_ZH": 169,
+      "博易_ZH": 170,
+      "迈蒙_ZH": 171,
+      "掇星攫辰天君_ZH": 172,
+      "毗伽尔_ZH": 173,
+      "芙卡洛斯_ZH": 174,
+      "恶龙_ZH": 175,
+      "恕筠_ZH": 176,
+      "知易_ZH": 177,
+      "克列门特_ZH": 178,
+      "大慈树王_ZH": 179,
+      "西拉杰_ZH": 180,
+      "上杉_ZH": 181,
+      "阿尔卡米_ZH": 182,
+      "纯水精灵_ZH": 183,
+      "常九爷_ZH": 184,
+      "沙扎曼_ZH": 185,
+      "田铁嘴_ZH": 186,
+      "克罗索_ZH": 187,
+      "阿巴图伊_ZH": 188,
+      "悦_ZH": 189,
+      "阿佩普_ZH": 190,
+      "埃尔欣根_ZH": 191,
+      "萨赫哈蒂_ZH": 192,
+      "塔杰·拉德卡尼_ZH": 193,
+      "安西_ZH": 194,
+      "埃舍尔_ZH": 195,
+      "萨齐因_ZH": 196,
+      "派蒙_JP": 197,
+      "纳西妲_JP": 198,
+      "凯亚_JP": 199,
+      "阿贝多_JP": 200,
+      "温迪_JP": 201,
+      "枫原万叶_JP": 202,
+      "钟离_JP": 203,
+      "荒泷一斗_JP": 204,
+      "八重神子_JP": 205,
+      "艾尔海森_JP": 206,
+      "提纳里_JP": 207,
+      "迪希雅_JP": 208,
+      "卡维_JP": 209,
+      "宵宫_JP": 210,
+      "那维莱特_JP": 211,
+      "莱依拉_JP": 212,
+      "赛诺_JP": 213,
+      "莫娜_JP": 214,
+      "诺艾尔_JP": 215,
+      "托马_JP": 216,
+      "凝光_JP": 217,
+      "林尼_JP": 218,
+      "北斗_JP": 219,
+      "柯莱_JP": 220,
+      "神里绫华_JP": 221,
+      "可莉_JP": 222,
+      "芭芭拉_JP": 223,
+      "雷电将军_JP": 224,
+      "娜维娅_JP": 225,
+      "芙宁娜_JP": 226,
+      "珊瑚宫心海_JP": 227,
+      "鹿野院平藏_JP": 228,
+      "迪奥娜_JP": 229,
+      "琴_JP": 230,
+      "五郎_JP": 231,
+      "班尼特_JP": 232,
+      "达达利亚_JP": 233,
+      "安柏_JP": 234,
+      "莱欧斯利_JP": 235,
+      "夜兰_JP": 236,
+      "妮露_JP": 237,
+      "辛焱_JP": 238,
+      "丽莎_JP": 239,
+      "珐露珊_JP": 240,
+      "魈_JP": 241,
+      "香菱_JP": 242,
+      "迪卢克_JP": 243,
+      "砂糖_JP": 244,
+      "烟绯_JP": 245,
+      "早柚_JP": 246,
+      "云堇_JP": 247,
+      "刻晴_JP": 248,
+      "重云_JP": 249,
+      "优菈_JP": 250,
+      "胡桃_JP": 251,
+      "流浪者_JP": 252,
+      "久岐忍_JP": 253,
+      "神里绫人_JP": 254,
+      "甘雨_JP": 255,
+      "戴因斯雷布_JP": 256,
+      "菲谢尔_JP": 257,
+      "白术_JP": 258,
+      "行秋_JP": 259,
+      "九条裟罗_JP": 260,
+      "夏洛蒂_JP": 261,
+      "雷泽_JP": 262,
+      "申鹤_JP": 263,
+      "空_JP": 264,
+      "荧_JP": 265,
+      "迪娜泽黛_JP": 266,
+      "凯瑟琳_JP": 267,
+      "多莉_JP": 268,
+      "坎蒂丝_JP": 269,
+      "琳妮特_JP": 270,
+      "萍姥姥_JP": 271,
+      "罗莎莉亚_JP": 272,
+      "埃德_JP": 273,
+      "爱贝尔_JP": 274,
+      "伊迪娅_JP": 275,
+      "留云借风真君_JP": 276,
+      "绮良良_JP": 277,
+      "七七_JP": 278,
+      "式大将_JP": 279,
+      "瑶瑶_JP": 280,
+      "奥兹_JP": 281,
+      "菲米尼_JP": 282,
+      "米卡_JP": 283,
+      "哲平_JP": 284,
+      "大肉丸_JP": 285,
+      "托克_JP": 286,
+      "蒂玛乌斯_JP": 287,
+      "昆钧_JP": 288,
+      "欧菲妮_JP": 289,
+      "塞琉斯_JP": 290,
+      "仆人_JP": 291,
+      "迈勒斯_JP": 292,
+      "希格雯_JP": 293,
+      "阿守_JP": 294,
+      "拉赫曼_JP": 295,
+      "杜拉夫_JP": 296,
+      "伊利亚斯_JP": 297,
+      "阿晃_JP": 298,
+      "旁白_JP": 299,
+      "爱德琳_JP": 300,
+      "埃洛伊_JP": 301,
+      "德沃沙克_JP": 302,
+      "玛乔丽_JP": 303,
+      "塞塔蕾_JP": 304,
+      "柊千里_JP": 305,
+      "海芭夏_JP": 306,
+      "九条镰治_JP": 307,
+      "阿娜耶_JP": 308,
+      "笼钓瓶一心_JP": 309,
+      "回声海螺_JP": 310,
+      "劳维克_JP": 311,
+      "元太_JP": 312,
+      "阿扎尔_JP": 313,
+      "查尔斯_JP": 314,
+      "阿洛瓦_JP": 315,
+      "埃勒曼_JP": 316,
+      "纳比尔_JP": 317,
+      "莎拉_JP": 318,
+      "康纳_JP": 319,
+      "博来_JP": 320,
+      "玛塞勒_JP": 321,
+      "阿祇_JP": 322,
+      "博士_JP": 323,
+      "迪尔菲_JP": 324,
+      "玛格丽特_JP": 325,
+      "宛烟_JP": 326,
+      "羽生田千鹤_JP": 327,
+      "海妮耶_JP": 328,
+      "霍夫曼_JP": 329,
+      "旅行者_JP": 330,
+      "佐西摩斯_JP": 331,
+      "舒伯特_JP": 332,
+      "鹿野奈奈_JP": 333,
+      "天叔_JP": 334,
+      "龙二_JP": 335,
+      "艾莉丝_JP": 336,
+      "莺儿_JP": 337,
+      "嘉良_JP": 338,
+      "珊瑚_JP": 339,
+      "言笑_JP": 340,
+      "一心传名刀_JP": 341,
+      "费迪南德_JP": 342,
+      "久利须_JP": 343,
+      "嘉玛_JP": 344,
+      "艾文_JP": 345,
+      "克洛琳德_JP": 346,
+      "丹吉尔_JP": 347,
+      "天目十五_JP": 348,
+      "女士_JP": 349,
+      "老孟_JP": 350,
+      "白老先生_JP": 351,
+      "舍利夫_JP": 352,
+      "巴达维_JP": 353,
+      "拉齐_JP": 354,
+      "长生_JP": 355,
+      "吴船长_JP": 356,
+      "艾伯特_JP": 357,
+      "松浦_JP": 358,
+      "埃泽_JP": 359,
+      "阿圆_JP": 360,
+      "阿拉夫_JP": 361,
+      "莫塞伊思_JP": 362,
+      "石头_JP": 363,
+      "百闻_JP": 364,
+      "杜吉耶_JP": 365,
+      "波洛_JP": 366,
+      "掇星攫辰天君_JP": 367,
+      "迈蒙_JP": 368,
+      "博易_JP": 369,
+      "诗筠_JP": 370,
+      "斯坦利_JP": 371,
+      "毗伽尔_JP": 372,
+      "芙卡洛斯_JP": 373,
+      "恶龙_JP": 374,
+      "小仓澪_JP": 375,
+      "恕筠_JP": 376,
+      "知易_JP": 377,
+      "克列门特_JP": 378,
+      "大慈树王_JP": 379,
+      "望雅_JP": 380,
+      "黑田_JP": 381,
+      "卡莉娜_JP": 382,
+      "马姆杜_JP": 383,
+      "科林斯_JP": 384,
+      "上杉_JP": 385,
+      "西拉杰_JP": 386,
+      "菲尔戈黛特_JP": 387,
+      "一平_JP": 388,
+      "纯水精灵_JP": 389,
+      "阿尔卡米_JP": 390,
+      "老戴_JP": 391,
+      "谢赫祖拜尔_JP": 392,
+      "沙扎曼_JP": 393,
+      "田铁嘴_JP": 394,
+      "小野寺_JP": 395,
+      "百识_JP": 396,
+      "克罗索_JP": 397,
+      "莱斯格_JP": 398,
+      "芷巧_JP": 399,
+      "加藤洋平_JP": 400,
+      "阿巴图伊_JP": 401,
+      "埃尔欣根_JP": 402,
+      "斯嘉莉_JP": 403,
+      "阿佩普_JP": 404,
+      "巫女_JP": 405,
+      "卡布斯_JP": 406,
+      "洛伦佐_JP": 407,
+      "萨赫哈蒂_JP": 408,
+      "娜德瓦_JP": 409,
+      "塞德娜_JP": 410,
+      "塔杰·拉德卡尼_JP": 411,
+      "绘星_JP": 412,
+      "泽田_JP": 413,
+      "安西_JP": 414,
+      "拉伊德_JP": 415,
+      "亚卡巴_JP": 416,
+      "有乐斋_JP": 417,
+      "莱昂_JP": 418,
+      "尤苏波夫_JP": 419,
+      "夏妮_JP": 420,
+      "埃舍尔_JP": 421,
+      "萨齐因_JP": 422,
+      "古山_JP": 423,
+      "自称渊上之物_JP": 424,
+      "丹羽_JP": 425,
+      "塞萨尔的日记_JP": 426,
+      "派蒙_EN": 427,
+      "纳西妲_EN": 428,
+      "凯亚_EN": 429,
+      "阿贝多_EN": 430,
+      "温迪_EN": 431,
+      "枫原万叶_EN": 432,
+      "钟离_EN": 433,
+      "荒泷一斗_EN": 434,
+      "八重神子_EN": 435,
+      "艾尔海森_EN": 436,
+      "提纳里_EN": 437,
+      "迪希雅_EN": 438,
+      "卡维_EN": 439,
+      "宵宫_EN": 440,
+      "莱依拉_EN": 441,
+      "那维莱特_EN": 442,
+      "赛诺_EN": 443,
+      "莫娜_EN": 444,
+      "诺艾尔_EN": 445,
+      "托马_EN": 446,
+      "凝光_EN": 447,
+      "林尼_EN": 448,
+      "北斗_EN": 449,
+      "柯莱_EN": 450,
+      "神里绫华_EN": 451,
+      "可莉_EN": 452,
+      "芭芭拉_EN": 453,
+      "雷电将军_EN": 454,
+      "娜维娅_EN": 455,
+      "芙宁娜_EN": 456,
+      "珊瑚宫心海_EN": 457,
+      "鹿野院平藏_EN": 458,
+      "迪奥娜_EN": 459,
+      "五郎_EN": 460,
+      "琴_EN": 461,
+      "班尼特_EN": 462,
+      "达达利亚_EN": 463,
+      "安柏_EN": 464,
+      "莱欧斯利_EN": 465,
+      "夜兰_EN": 466,
+      "妮露_EN": 467,
+      "辛焱_EN": 468,
+      "珐露珊_EN": 469,
+      "丽莎_EN": 470,
+      "魈_EN": 471,
+      "香菱_EN": 472,
+      "迪卢克_EN": 473,
+      "砂糖_EN": 474,
+      "烟绯_EN": 475,
+      "早柚_EN": 476,
+      "云堇_EN": 477,
+      "刻晴_EN": 478,
+      "重云_EN": 479,
+      "优菈_EN": 480,
+      "胡桃_EN": 481,
+      "流浪者_EN": 482,
+      "久岐忍_EN": 483,
+      "神里绫人_EN": 484,
+      "甘雨_EN": 485,
+      "戴因斯雷布_EN": 486,
+      "菲谢尔_EN": 487,
+      "白术_EN": 488,
+      "行秋_EN": 489,
+      "九条裟罗_EN": 490,
+      "夏洛蒂_EN": 491,
+      "雷泽_EN": 492,
+      "申鹤_EN": 493,
+      "荧_EN": 494,
+      "空_EN": 495,
+      "迪娜泽黛_EN": 496,
+      "凯瑟琳_EN": 497,
+      "多莉_EN": 498,
+      "坎蒂丝_EN": 499,
+      "琳妮特_EN": 500,
+      "萍姥姥_EN": 501,
+      "罗莎莉亚_EN": 502,
+      "埃德_EN": 503,
+      "爱贝尔_EN": 504,
+      "伊迪娅_EN": 505,
+      "留云借风真君_EN": 506,
+      "绮良良_EN": 507,
+      "七七_EN": 508,
+      "式大将_EN": 509,
+      "瑶瑶_EN": 510,
+      "奥兹_EN": 511,
+      "菲米尼_EN": 512,
+      "米卡_EN": 513,
+      "哲平_EN": 514,
+      "大肉丸_EN": 515,
+      "托克_EN": 516,
+      "蒂玛乌斯_EN": 517,
+      "昆钧_EN": 518,
+      "欧菲妮_EN": 519,
+      "塞琉斯_EN": 520,
+      "仆人_EN": 521,
+      "迈勒斯_EN": 522,
+      "希格雯_EN": 523,
+      "阿守_EN": 524,
+      "拉赫曼_EN": 525,
+      "杜拉夫_EN": 526,
+      "伊利亚斯_EN": 527,
+      "阿晃_EN": 528,
+      "旁白_EN": 529,
+      "爱德琳_EN": 530,
+      "埃洛伊_EN": 531,
+      "德沃沙克_EN": 532,
+      "玛乔丽_EN": 533,
+      "塞塔蕾_EN": 534,
+      "柊千里_EN": 535,
+      "海芭夏_EN": 536,
+      "九条镰治_EN": 537,
+      "阿娜耶_EN": 538,
+      "笼钓瓶一心_EN": 539,
+      "回声海螺_EN": 540,
+      "劳维克_EN": 541,
+      "元太_EN": 542,
+      "阿扎尔_EN": 543,
+      "查尔斯_EN": 544,
+      "阿洛瓦_EN": 545,
+      "埃勒曼_EN": 546,
+      "纳比尔_EN": 547,
+      "莎拉_EN": 548,
+      "康纳_EN": 549,
+      "博来_EN": 550,
+      "玛塞勒_EN": 551,
+      "阿祇_EN": 552,
+      "博士_EN": 553,
+      "迪尔菲_EN": 554,
+      "宛烟_EN": 555,
+      "玛格丽特_EN": 556,
+      "羽生田千鹤_EN": 557,
+      "海妮耶_EN": 558,
+      "霍夫曼_EN": 559,
+      "旅行者_EN": 560,
+      "佐西摩斯_EN": 561,
+      "鹿野奈奈_EN": 562,
+      "舒伯特_EN": 563,
+      "天叔_EN": 564,
+      "艾莉丝_EN": 565,
+      "龙二_EN": 566,
+      "莺儿_EN": 567,
+      "嘉良_EN": 568,
+      "珊瑚_EN": 569,
+      "费迪南德_EN": 570,
+      "言笑_EN": 571,
+      "一心传名刀_EN": 572,
+      "久利须_EN": 573,
+      "嘉玛_EN": 574,
+      "艾文_EN": 575,
+      "克洛琳德_EN": 576,
+      "丹吉尔_EN": 577,
+      "女士_EN": 578,
+      "天目十五_EN": 579,
+      "老孟_EN": 580,
+      "白老先生_EN": 581,
+      "舍利夫_EN": 582,
+      "巴达维_EN": 583,
+      "拉齐_EN": 584,
+      "长生_EN": 585,
+      "吴船长_EN": 586,
+      "艾伯特_EN": 587,
+      "松浦_EN": 588,
+      "埃泽_EN": 589,
+      "阿圆_EN": 590,
+      "阿拉夫_EN": 591,
+      "莫塞伊思_EN": 592,
+      "石头_EN": 593,
+      "百闻_EN": 594,
+      "杜吉耶_EN": 595,
+      "波洛_EN": 596,
+      "斯坦利_EN": 597,
+      "掇星攫辰天君_EN": 598,
+      "迈蒙_EN": 599,
+      "博易_EN": 600,
+      "诗筠_EN": 601,
+      "毗伽尔_EN": 602,
+      "慧心_EN": 603,
+      "芙卡洛斯_EN": 604,
+      "恶龙_EN": 605,
+      "小仓澪_EN": 606,
+      "恕筠_EN": 607,
+      "知易_EN": 608,
+      "克列门特_EN": 609,
+      "大慈树王_EN": 610,
+      "维多利亚_EN": 611,
+      "黑田_EN": 612,
+      "马姆杜_EN": 613,
+      "科林斯_EN": 614,
+      "上杉_EN": 615,
+      "西拉杰_EN": 616,
+      "宁禄_EN": 617,
+      "纯水精灵_EN": 618,
+      "常九爷_EN": 619,
+      "阿尔卡米_EN": 620,
+      "沙扎曼_EN": 621,
+      "田铁嘴_EN": 622,
+      "加萨尼_EN": 623,
+      "克罗索_EN": 624,
+      "星稀_EN": 625,
+      "莱斯格_EN": 626,
+      "阿巴图伊_EN": 627,
+      "悦_EN": 628,
+      "德田_EN": 629,
+      "埃尔欣根_EN": 630,
+      "阿佩普_EN": 631,
+      "萨赫哈蒂_EN": 632,
+      "洛伦佐_EN": 633,
+      "塔杰·拉德卡尼_EN": 634,
+      "泽田_EN": 635,
+      "安西_EN": 636,
+      "理水叠山真君_EN": 637,
+      "埃舍尔_EN": 638,
+      "萨齐因_EN": 639,
+      "古田_EN": 640,
+      "三月七_ZH": 641,
+      "丹恒_ZH": 642,
+      "希儿_ZH": 643,
+      "娜塔莎_ZH": 644,
+      "希露瓦_ZH": 645,
+      "瓦尔特_ZH": 646,
+      "佩拉_ZH": 647,
+      "布洛妮娅_ZH": 648,
+      "虎克_ZH": 649,
+      "素裳_ZH": 650,
+      "克拉拉_ZH": 651,
+      "符玄_ZH": 652,
+      "白露_ZH": 653,
+      "杰帕德_ZH": 654,
+      "景元_ZH": 655,
+      "藿藿_ZH": 656,
+      "姬子_ZH": 657,
+      "穹_ZH": 658,
+      "星_ZH": 659,
+      "卡芙卡_ZH": 660,
+      "桂乃芬_ZH": 661,
+      "艾丝妲_ZH": 662,
+      "玲可_ZH": 663,
+      "彦卿_ZH": 664,
+      "托帕_ZH": 665,
+      "驭空_ZH": 666,
+      "浮烟_ZH": 667,
+      "停云_ZH": 668,
+      "镜流_ZH": 669,
+      "罗刹_ZH": 670,
+      "卢卡_ZH": 671,
+      "史瓦罗_ZH": 672,
+      "黑塔_ZH": 673,
+      "桑博_ZH": 674,
+      "伦纳德_ZH": 675,
+      "明曦_ZH": 676,
+      "银狼_ZH": 677,
+      "帕姆_ZH": 678,
+      "青雀_ZH": 679,
+      "乔瓦尼_ZH": 680,
+      "公输师傅_ZH": 681,
+      "晴霓_ZH": 682,
+      "螺丝咕姆_ZH": 683,
+      "阿兰_ZH": 684,
+      "奥列格_ZH": 685,
+      "丹枢_ZH": 686,
+      "尾巴_ZH": 687,
+      "寒鸦_ZH": 688,
+      "雪衣_ZH": 689,
+      "可可利亚_ZH": 690,
+      "青镞_ZH": 691,
+      "半夏_ZH": 692,
+      "银枝_ZH": 693,
+      "大毫_ZH": 694,
+      "霄翰_ZH": 695,
+      "信使_ZH": 696,
+      "费斯曼_ZH": 697,
+      "绿芙蓉_ZH": 698,
+      "dev_成男_ZH": 699,
+      "金人会长_ZH": 700,
+      "维利特_ZH": 701,
+      "维尔德_ZH": 702,
+      "斯科特_ZH": 703,
+      "卡波特_ZH": 704,
+      "刃_ZH": 705,
+      "岩明_ZH": 706,
+      "浣溪_ZH": 707,
+      "三月七_JP": 708,
+      "丹恒_JP": 709,
+      "希儿_JP": 710,
+      "娜塔莎_JP": 711,
+      "希露瓦_JP": 712,
+      "瓦尔特_JP": 713,
+      "佩拉_JP": 714,
+      "布洛妮娅_JP": 715,
+      "虎克_JP": 716,
+      "素裳_JP": 717,
+      "克拉拉_JP": 718,
+      "符玄_JP": 719,
+      "白露_JP": 720,
+      "杰帕德_JP": 721,
+      "景元_JP": 722,
+      "藿藿_JP": 723,
+      "姬子_JP": 724,
+      "卡芙卡_JP": 725,
+      "穹_JP": 726,
+      "星_JP": 727,
+      "桂乃芬_JP": 728,
+      "艾丝妲_JP": 729,
+      "彦卿_JP": 730,
+      "玲可_JP": 731,
+      "托帕_JP": 732,
+      "驭空_JP": 733,
+      "浮烟_JP": 734,
+      "停云_JP": 735,
+      "镜流_JP": 736,
+      "罗刹_JP": 737,
+      "卢卡_JP": 738,
+      "史瓦罗_JP": 739,
+      "黑塔_JP": 740,
+      "桑博_JP": 741,
+      "伦纳德_JP": 742,
+      "明曦_JP": 743,
+      "银狼_JP": 744,
+      "帕姆_JP": 745,
+      "青雀_JP": 746,
+      "乔瓦尼_JP": 747,
+      "公输师傅_JP": 748,
+      "晴霓_JP": 749,
+      "螺丝咕姆_JP": 750,
+      "阿兰_JP": 751,
+      "奥列格_JP": 752,
+      "丹枢_JP": 753,
+      "尾巴_JP": 754,
+      "寒鸦_JP": 755,
+      "雪衣_JP": 756,
+      "可可利亚_JP": 757,
+      "青镞_JP": 758,
+      "半夏_JP": 759,
+      "银枝_JP": 760,
+      "大毫_JP": 761,
+      "霄翰_JP": 762,
+      "信使_JP": 763,
+      "费斯曼_JP": 764,
+      "绿芙蓉_JP": 765,
+      "dev_成男_JP": 766,
+      "金人会长_JP": 767,
+      "维利特_JP": 768,
+      "维尔德_JP": 769,
+      "斯科特_JP": 770,
+      "刃_JP": 771,
+      "卡波特_JP": 772,
+      "岩明_JP": 773,
+      "浣溪_JP": 774,
+      "净砚_JP": 775,
+      "紫月季_JP": 776,
+      "歌蒂_JP": 777,
+      "奇怪的云骑_JP": 778,
+      "幻胧_JP": 779,
+      "斯薇塔_JP": 780,
+      "隐书_JP": 781,
+      "三月七_EN": 782,
+      "丹恒_EN": 783,
+      "希儿_EN": 784,
+      "娜塔莎_EN": 785,
+      "希露瓦_EN": 786,
+      "瓦尔特_EN": 787,
+      "佩拉_EN": 788,
+      "布洛妮娅_EN": 789,
+      "虎克_EN": 790,
+      "素裳_EN": 791,
+      "克拉拉_EN": 792,
+      "符玄_EN": 793,
+      "白露_EN": 794,
+      "杰帕德_EN": 795,
+      "景元_EN": 796,
+      "藿藿_EN": 797,
+      "姬子_EN": 798,
+      "卡芙卡_EN": 799,
+      "穹_EN": 800,
+      "星_EN": 801,
+      "桂乃芬_EN": 802,
+      "艾丝妲_EN": 803,
+      "彦卿_EN": 804,
+      "玲可_EN": 805,
+      "托帕_EN": 806,
+      "驭空_EN": 807,
+      "浮烟_EN": 808,
+      "停云_EN": 809,
+      "镜流_EN": 810,
+      "罗刹_EN": 811,
+      "卢卡_EN": 812,
+      "史瓦罗_EN": 813,
+      "黑塔_EN": 814,
+      "桑博_EN": 815,
+      "伦纳德_EN": 816,
+      "明曦_EN": 817,
+      "银狼_EN": 818,
+      "帕姆_EN": 819,
+      "青雀_EN": 820,
+      "乔瓦尼_EN": 821,
+      "公输师傅_EN": 822,
+      "晴霓_EN": 823,
+      "螺丝咕姆_EN": 824,
+      "阿兰_EN": 825,
+      "奥列格_EN": 826,
+      "丹枢_EN": 827,
+      "尾巴_EN": 828,
+      "寒鸦_EN": 829,
+      "雪衣_EN": 830,
+      "可可利亚_EN": 831,
+      "青镞_EN": 832,
+      "半夏_EN": 833,
+      "银枝_EN": 834,
+      "大毫_EN": 835,
+      "霄翰_EN": 836,
+      "信使_EN": 837,
+      "费斯曼_EN": 838,
+      "绿芙蓉_EN": 839,
+      "dev_成男_EN": 840,
+      "金人会长_EN": 841,
+      "维利特_EN": 842,
+      "维尔德_EN": 843,
+      "刃_EN": 844,
+      "卡波特_EN": 845,
+      "岩明_EN": 846,
+      "浣溪_EN": 847,
+      "紫月季_EN": 848,
+      "幻胧_EN": 849,
+      "女声_EN": 850,
+      "陆景和": 851,
+      "莫弈": 852,
+      "左然": 853,
+      "夏彦": 854
     }
   },
   "model": {
     "use_spectral_norm": false,
     "gin_channels": 256
   },
+  "version": "2.2"
+}

css/custom.css ADDED Viewed

	@@ -0,0 +1,18 @@

+#yml_code {
+    height: 600px;
+    flex-grow: inherit;
+    overflow-y: auto;
+}
+#json_code {
+    height: 600px;
+    flex-grow: inherit;
+    overflow-y: auto;
+}
+#gpu_code {
+    height: 300px;
+    flex-grow: inherit;
+    overflow-y: auto;
+}

data_utils.py CHANGED Viewed

@@ -3,11 +3,13 @@ import random
 import torch
 import torch.utils.data
 from tqdm import tqdm
 from tools.log import logger
 import commons
 from mel_processing import spectrogram_torch, mel_spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 from text import cleaned_text_to_sequence
 """Multi speaker version"""
@@ -40,7 +42,11 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         self.add_blank = hparams.add_blank
         self.min_text_len = getattr(hparams, "min_text_len", 1)
-        self.max_text_len = getattr(hparams, "max_text_len", 300)
         random.seed(1234)
         random.shuffle(self.audiopaths_sid_text)
@@ -91,7 +97,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         spec, wav = self.get_audio(audiopath)
         sid = torch.LongTensor([int(self.spk_map[sid])])
-        return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert)
     def get_audio(self, filename):
         audio, sampling_rate = load_wav_to_torch(filename)
@@ -131,7 +145,8 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
                     center=False,
                 )
             spec = torch.squeeze(spec, 0)
-            torch.save(spec, spec_filename)
         return spec, audio_norm
     def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
@@ -153,15 +168,15 @@ class TextAudioSpeakerLoader(torch.utils.data.Dataset):
         if language_str == "ZH":
             bert = bert_ori
-            ja_bert = torch.zeros(1024, len(phone))
-            en_bert = torch.zeros(1024, len(phone))
         elif language_str == "JP":
-            bert = torch.zeros(1024, len(phone))
             ja_bert = bert_ori
-            en_bert = torch.zeros(1024, len(phone))
         elif language_str == "EN":
-            bert = torch.zeros(1024, len(phone))
-            ja_bert = torch.zeros(1024, len(phone))
             en_bert = bert_ori
         phone = torch.LongTensor(phone)
         tone = torch.LongTensor(tone)
@@ -211,6 +226,7 @@ class TextAudioSpeakerCollate:
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
@@ -222,6 +238,7 @@ class TextAudioSpeakerCollate:
         bert_padded.zero_()
         ja_bert_padded.zero_()
         en_bert_padded.zero_()
         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
@@ -255,6 +272,8 @@ class TextAudioSpeakerCollate:
             en_bert = row[8]
             en_bert_padded[i, :, : en_bert.size(1)] = en_bert
         return (
             text_padded,
             text_lengths,
@@ -268,6 +287,7 @@ class TextAudioSpeakerCollate:
             bert_padded,
             ja_bert_padded,
             en_bert_padded,
         )

 import torch
 import torch.utils.data
 from tqdm import tqdm
+import numpy as np
 from tools.log import logger
 import commons
 from mel_processing import spectrogram_torch, mel_spectrogram_torch
 from utils import load_wav_to_torch, load_filepaths_and_text
 from text import cleaned_text_to_sequence
+from config import config
 """Multi speaker version"""
         self.add_blank = hparams.add_blank
         self.min_text_len = getattr(hparams, "min_text_len", 1)
+        self.max_text_len = getattr(hparams, "max_text_len", 384)
+        self.empty_emo = torch.squeeze(
+            torch.load("empty_emo.npy", map_location="cpu"), dim=1
+        )
         random.seed(1234)
         random.shuffle(self.audiopaths_sid_text)
         spec, wav = self.get_audio(audiopath)
         sid = torch.LongTensor([int(self.spk_map[sid])])
+        if np.random.rand() > 0.1:
+            emo = torch.squeeze(
+                torch.load(audiopath.replace(".wav", ".emo.npy"), map_location="cpu"),
+                dim=1,
+            )
+        else:
+            emo = self.empty_emo
+        return (phones, spec, wav, sid, tone, language, bert, ja_bert, en_bert, emo)
     def get_audio(self, filename):
         audio, sampling_rate = load_wav_to_torch(filename)
                     center=False,
                 )
             spec = torch.squeeze(spec, 0)
+            if config.train_ms_config.spec_cache:
+                torch.save(spec, spec_filename)
         return spec, audio_norm
     def get_text(self, text, word2ph, phone, tone, language_str, wav_path):
         if language_str == "ZH":
             bert = bert_ori
+            ja_bert = torch.rand(1024, len(phone))
+            en_bert = torch.rand(1024, len(phone))
         elif language_str == "JP":
+            bert = torch.rand(1024, len(phone))
             ja_bert = bert_ori
+            en_bert = torch.rand(1024, len(phone))
         elif language_str == "EN":
+            bert = torch.rand(1024, len(phone))
+            ja_bert = torch.rand(1024, len(phone))
             en_bert = bert_ori
         phone = torch.LongTensor(phone)
         tone = torch.LongTensor(tone)
         bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         ja_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
         en_bert_padded = torch.FloatTensor(len(batch), 1024, max_text_len)
+        emo = torch.FloatTensor(len(batch), 512)
         spec_padded = torch.FloatTensor(len(batch), batch[0][1].size(0), max_spec_len)
         wav_padded = torch.FloatTensor(len(batch), 1, max_wav_len)
         bert_padded.zero_()
         ja_bert_padded.zero_()
         en_bert_padded.zero_()
+        emo.zero_()
         for i in range(len(ids_sorted_decreasing)):
             row = batch[ids_sorted_decreasing[i]]
             en_bert = row[8]
             en_bert_padded[i, :, : en_bert.size(1)] = en_bert
+            emo[i, :] = row[9]
         return (
             text_padded,
             text_lengths,
             bert_padded,
             ja_bert_padded,
             en_bert_padded,
+            emo,
         )

default_config.yml CHANGED Viewed

@@ -4,7 +4,7 @@
 # 拟提供通用路径配置，统一存放数据，避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下，后续所有的路径配置均为相对于datasetPath的路径
 # 不填或者填空则路径为相对于项目根目录的路径
-dataset_path: ""
 # 模型镜像源，默认huggingface，使用openi镜像源需指定openi_token
 mirror: ""
@@ -17,16 +17,16 @@ resample:
   sampling_rate: 44100
   # 音频文件输入路径，重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
-  in_dir: "" # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
-  out_dir: ""
 # preprocess_text 数据集预处理相关配置
 # 注意， “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径，文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
-  transcription_path: "filelists/bushroid.list"
   # 数据清洗后文本路径，可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
@@ -35,10 +35,10 @@ preprocess_text:
   val_path: "filelists/val.list"
   # 配置文件路径
   config_path: "config.json"
-  # 每个speaker的验证集条数
-  val_per_spk: 4
   # 验证集最大条数，多于的会被截断并放到训练集中
-  max_val_total: 8
   # 是否进行数据清洗
   clean: true
@@ -49,35 +49,51 @@ bert_gen:
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
-  num_processes: 2
   # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
   # 使用多卡推理
   use_multi_device: false
 # train 训练配置
 # 注意， “:” 后需要加空格
 train_ms:
-  # 需要加载的环境变量，多显卡训练时RANK请手动在环境变量填写
-  # 环境变量对应名称环境变量不存在时加载，也就是说手动添加的环境变量优先级更高，会覆盖本配置文件
   env:
     MASTER_ADDR: "localhost"
     MASTER_PORT: 10086
     WORLD_SIZE: 1
     RANK: 0
     # 可以填写任意名的环境变量
     # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
-    use_base_model: True
     repo_id: "Stardust_minus/Bert-VITS2"
-    model_image: "Bert-VITS2中日英底模-fix" # openi网页的模型名
   # 训练模型存储目录：与旧版本的区别，原先数据集是存放在logs/model_name下的，现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
-  config_path: "configs/config.json"
 # webui webui配置
@@ -86,9 +102,9 @@ webui:
   # 推理设备
   device: "cuda"
   # 模型路径
-  model: "genshin/models/G_8000.pth"
   # 配置文件路径
-  config_path: "configs/config.json"
   # 端口号
   port: 7860
   # 是否公开部署，对外网开放
@@ -99,7 +115,7 @@ webui:
   language_identification_library: "langid"
-# server api配置
 # 注意， “:” 后需要加空格
 # 注意，本配置下的所有配置均为相对于根目录的路径
 server:
@@ -107,8 +123,10 @@ server:
   port: 5000
   # 模型默认使用设备：但是当前并没有实现这个配置。
   device: "cuda"
-  # 需要加载的所有模型的配置
   # 注意，所有模型都必须正确配置model与config的路径，空路径会导致加载错误。
   models:
     - # 模型的路径
       model: ""
@@ -149,7 +167,6 @@ server:
       # 不必填写所有人物，不填的使用默认值
       speakers: [ ] # 也可以不填
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key

 # 拟提供通用路径配置，统一存放数据，避免数据放得很乱
 # 每个数据集与其对应的模型存放至统一路径下，后续所有的路径配置均为相对于dataset_path的路径
 # 不填或者填空则路径为相对于项目根目录的路径
+dataset_path: "Data/"
 # 模型镜像源，默认huggingface，使用openi镜像源需指定openi_token
 mirror: ""
   sampling_rate: 44100
   # 音频文件输入路径，重采样会将该路径下所有.wav音频文件重采样
   # 请填入相对于datasetPath的相对路径
+  in_dir: "audios/raw" # 相对于根目录的路径为 /datasetPath/in_dir
   # 音频文件重采样后输出路径
+  out_dir: "audios/wavs"
 # preprocess_text 数据集预处理相关配置
 # 注意， “:” 后需要加空格
 preprocess_text:
   # 原始文本文件路径，文本格式应为{wav_path}|{speaker_name}|{language}|{text}。
+  transcription_path: "filelists/你的数据集文本.list"
   # 数据清洗后文本路径，可以不填。不填则将在原始文本目录生成
   cleaned_path: ""
   # 训练集路径
   val_path: "filelists/val.list"
   # 配置文件路径
   config_path: "config.json"
+  # 每个语言的验证集条数
+  val_per_lang: 4
   # 验证集最大条数，多于的会被截断并放到训练集中
+  max_val_total: 12
   # 是否进行数据清洗
   clean: true
   # 训练数据集配置文件路径
   config_path: "config.json"
   # 并行数
+  num_processes: 4
   # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
   # 该选项同时决定了get_bert_feature的默认设备
   device: "cuda"
   # 使用多卡推理
   use_multi_device: false
+# emo_gen 相关配置
+# 注意， “:” 后需要加空格
+emo_gen:
+  # 训练数据集配置文件路径
+  config_path: "config.json"
+  # 并行数
+  num_processes: 4
+  # 使用设备：可选项 "cuda" 显卡推理，"cpu" cpu推理
+  device: "cuda"
+  # 使用多卡推理
+  use_multi_device: false
 # train 训练配置
 # 注意， “:” 后需要加空格
 train_ms:
   env:
     MASTER_ADDR: "localhost"
     MASTER_PORT: 10086
     WORLD_SIZE: 1
+    LOCAL_RANK: 0
     RANK: 0
     # 可以填写任意名的环境变量
     # THE_ENV_VAR_YOU_NEED_TO_USE: "1234567"
   # 底模设置
   base:
+    use_base_model: false
     repo_id: "Stardust_minus/Bert-VITS2"
+    model_image: "Bert-VITS2_2.2-Clap底模" # openi网页的模型名
   # 训练模型存储目录：与旧版本的区别，原先数据集是存放在logs/model_name下的，现在改为统一存放在Data/你的数据集/models下
   model: "models"
   # 配置文件路径
+  config_path: "config.json"
+  # 训练使用的worker数量，不建议超过CPU核心数
+  num_workers: 16
+  # 关闭此项可以节约接近50%的磁盘空间，但是可能导致实际训练速度变慢和更高的CPU使用率。
+  spec_cache: True
+  # 保存的检查点数量，多于此数目的权重会被删除来节省空间。
+  keep_ckpts: 8
 # webui webui配置
   # 推理设备
   device: "cuda"
   # 模型路径
+  model: "models/G_8000.pth"
   # 配置文件路径
+  config_path: "config.json"
   # 端口号
   port: 7860
   # 是否公开部署，对外网开放
   language_identification_library: "langid"
+# server-fastapi配置
 # 注意， “:” 后需要加空格
 # 注意，本配置下的所有配置均为相对于根目录的路径
 server:
   port: 5000
   # 模型默认使用设备：但是当前并没有实现这个配置。
   device: "cuda"
+  # 需要加载的所有模型的配置，可以填多个模型，也可以不填模型，等网页加载成功后手动加载模型
+  # 不加载模型的配置格式：删除默认给的两个模型配置，给models赋值 [ ]，也就是空列表。参考模型2的speakers 即 models: [ ]
   # 注意，所有模型都必须正确配置model与config的路径，空路径会导致加载错误。
+  # 也可以不填模型，等网页加载成功后手动填写models。
   models:
     - # 模型的路径
       model: ""
       # 不必填写所有人物，不填的使用默认值
       speakers: [ ] # 也可以不填
 # 百度翻译开放平台 api配置
 # api接入文档 https://api.fanyi.baidu.com/doc/21
 # 请不要在github等网站公开分享你的app id 与 key

emotional/clap-htsat-fused/.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

emotional/clap-htsat-fused/README.md ADDED Viewed

	@@ -0,0 +1,107 @@

+---
+license: apache-2.0
+---
+# Model card for CLAP
+Model card for CLAP: Contrastive Language-Audio Pretraining
+![clap_image](https://s3.amazonaws.com/moonup/production/uploads/1678811100805-62441d1d9fdefb55a0b7d12c.png)
+# Table of Contents
+0. [TL;DR](#tldr)
+1. [Model Details](#model-details)
+2. [Usage](#usage)
+3. [Uses](#uses)
+4. [Citation](#citation)
+# TL;DR
+The abstract of the paper states that:
+> Contrastive learning has shown remarkable success in the field of multimodal representation learning. In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. In audio classification tasks, the model achieves state-of-the-art performance in the zero-shot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. LAION-Audio-630K and the proposed model are both available to the public.
+# Usage
+You can use this model for zero-shot audio classification or for extracting audio and/or textual features.
+# Uses
+## Perform zero-shot audio classification
+### Using `pipeline`
+```python
+from datasets import load_dataset
+from transformers import pipeline
+dataset = load_dataset("ashraq/esc50")
+audio = dataset["train"]["audio"][-1]["array"]
+audio_classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-fused")
+output = audio_classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
+print(output)
+>>> [{"score": 0.999, "label": "Sound of a dog"}, {"score": 0.001, "label": "Sound of vaccum cleaner"}]
+```
+## Run the model:
+You can also get the audio and text embeddings using `ClapModel`
+### Run the model on CPU:
+```python
+from datasets import load_dataset
+from transformers import ClapModel, ClapProcessor
+librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio_sample = librispeech_dummy[0]
+model = ClapModel.from_pretrained("laion/clap-htsat-fused")
+processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
+inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt")
+audio_embed = model.get_audio_features(**inputs)
+```
+### Run the model on GPU:
+```python
+from datasets import load_dataset
+from transformers import ClapModel, ClapProcessor
+librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+audio_sample = librispeech_dummy[0]
+model = ClapModel.from_pretrained("laion/clap-htsat-fused").to(0)
+processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")
+inputs = processor(audios=audio_sample["audio"]["array"], return_tensors="pt").to(0)
+audio_embed = model.get_audio_features(**inputs)
+```
+# Citation
+If you are using this model for your work, please consider citing the original paper:
+```
+@misc{https://doi.org/10.48550/arxiv.2211.06687,
+  doi = {10.48550/ARXIV.2211.06687},
+  url = {https://arxiv.org/abs/2211.06687},
+  author = {Wu, Yusong and Chen, Ke and Zhang, Tianyu and Hui, Yuchen and Berg-Kirkpatrick, Taylor and Dubnov, Shlomo},
+  keywords = {Sound (cs.SD), Audio and Speech Processing (eess.AS), FOS: Computer and information sciences, FOS: Computer and information sciences, FOS: Electrical engineering, electronic engineering, information engineering, FOS: Electrical engineering, electronic engineering, information engineering},
+  title = {Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation},
+  publisher = {arXiv},
+  year = {2022},
+  copyright = {Creative Commons Attribution 4.0 International}
+}
+```

emotional/clap-htsat-fused/config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "_commit_hash": null,
+  "architectures": [
+    "ClapModel"
+  ],
+  "audio_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "aff_block_r": 4,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "depths": [
+      2,
+      2,
+      6,
+      2
+    ],
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "drop_path_rate": 0.0,
+    "early_stopping": false,
+    "enable_fusion": true,
+    "enable_patch_fusion": true,
+    "enable_patch_layer_norm": true,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "flatten_patch_embeds": true,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "fusion_num_hidden_layers": 2,
+    "fusion_type": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "mlp_ratio": 4.0,
+    "model_type": "clap_audio_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": [
+      4,
+      8,
+      16,
+      32
+    ],
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_classes": 527,
+    "num_hidden_layers": 4,
+    "num_mel_bins": 64,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_embed_input_channels": 1,
+    "patch_embeds_hidden_size": 96,
+    "patch_size": 4,
+    "patch_stride": [
+      4,
+      4
+    ],
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "projection_hidden_act": "relu",
+    "projection_hidden_size": 768,
+    "pruned_heads": {},
+    "qkv_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "spec_size": 256,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.27.0.dev0",
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "window_size": 8
+  },
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 14.285714285714285,
+  "model_type": "clap",
+  "num_hidden_layers": 16,
+  "projection_dim": 512,
+  "projection_hidden_act": "relu",
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_probs_dropout_prob": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "classifier_dropout": null,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "fusion_hidden_size": 768,
+    "fusion_num_hidden_layers": 2,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 514,
+    "min_length": 0,
+    "model_type": "clap_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "position_embedding_type": "absolute",
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim": 512,
+    "projection_hidden_act": "relu",
+    "projection_hidden_size": 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.27.0.dev0",
+    "type_vocab_size": 1,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 50265
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null
+}

emotional/clap-htsat-fused/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

emotional/clap-htsat-fused/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,22 @@

+{
+  "chunk_length_s": 10,
+  "feature_extractor_type": "ClapFeatureExtractor",
+  "feature_size": 64,
+  "fft_window_size": 1024,
+  "frequency_max": 14000,
+  "frequency_min": 50,
+  "hop_length": 480,
+  "max_length_s": 10,
+  "n_fft": 1024,
+  "nb_frequency_bins": 513,
+  "nb_max_frames": 1000,
+  "nb_max_samples": 480000,
+  "padding": "repeatpad",
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "processor_class": "ClapProcessor",
+  "return_attention_mask": false,
+  "sampling_rate": 48000,
+  "top_db": null,
+  "truncation": "fusion"
+}

emotional/clap-htsat-fused/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1ed5d0215d887551ddd0a49ce7311b21429ebdf1e6a129d4e68f743357225253
+size 614596545

emotional/clap-htsat-fused/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,15 @@

+{
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "mask_token": {
+    "content": "<mask>",
+    "lstrip": true,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<pad>",
+  "sep_token": "</s>",
+  "unk_token": "<unk>"
+}

emotional/clap-htsat-fused/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

emotional/clap-htsat-fused/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+  "add_prefix_space": false,
+  "bos_token": "<s>",
+  "cls_token": "<s>",
+  "eos_token": "</s>",
+  "errors": "replace",
+  "mask_token": "<mask>",
+  "model_max_length": 512,
+  "pad_token": "<pad>",
+  "processor_class": "ClapProcessor",
+  "sep_token": "</s>",
+  "special_tokens_map_file": null,
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
+  "unk_token": "<unk>"
+}

emotional/clap-htsat-fused/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

empty_emo.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:07063411ab7d6e7aacfc73c582616c3fbc8fdf518b20d42d8be77bc9caf6fab9
+size 3238

export_onnx.py CHANGED Viewed

@@ -1,54 +1,10 @@
-from models_onnx import SynthesizerTrn
-import utils
-from text.symbols import symbols
 import os
-import json
-def export_onnx(export_path, model_path, config_path):
-    hps = utils.get_hparams_from_file(config_path)
-    net_g = SynthesizerTrn(
-        len(symbols),
-        hps.data.filter_length // 2 + 1,
-        hps.train.segment_size // hps.data.hop_length,
-        n_speakers=hps.data.n_speakers,
-        **hps.model,
-    )
-    _ = net_g.eval()
-    _ = utils.load_checkpoint(model_path, net_g, None, skip_optimizer=True)
-    net_g.export_onnx(export_path)
-    spklist = []
-    for key in hps.data.spk2id.keys():
-        spklist.append(key)
-    MoeVSConf = {
-        "Folder": f"{export_path}",
-        "Name": f"{export_path}",
-        "Type": "BertVits",
-        "Symbol": symbols,
-        "Cleaner": "",
-        "Rate": hps.data.sampling_rate,
-        "CharaMix": True,
-        "Characters": spklist,
-        "LanguageMap": {"ZH": [0, 0], "JP": [1, 6], "EN": [2, 8]},
-        "Dict": "BasicDict",
-        "BertPath": [
-            "chinese-roberta-wwm-ext-large",
-            "deberta-v2-large-japanese",
-            "bert-base-japanese-v3",
-        ],
-    }
-    with open(f"onnx/{export_path}.json", "w") as MoeVsConfFile:
-        json.dump(MoeVSConf, MoeVsConfFile, indent=4)
 if __name__ == "__main__":
-    print(symbols)
-    export_path = "HimenoSena"
-    model_path = "G_53000.pth"
-    config_path = "config.json"
     if not os.path.exists("onnx"):
         os.makedirs("onnx")
     if not os.path.exists(f"onnx/{export_path}"):

+from onnx_modules import export_onnx
 import os
 if __name__ == "__main__":
+    export_path = "BertVits2.2PT"
+    model_path = "model\\G_0.pth"
+    config_path = "model\\config.json"
     if not os.path.exists("onnx"):
         os.makedirs("onnx")
     if not os.path.exists(f"onnx/{export_path}"):

filelists/sample.list ADDED Viewed

	@@ -0,0 +1,3 @@

+Example:
+{wav_path}|{speaker_name}|{language}|{text}
+派蒙_1.wav|派蒙|ZH|前面的区域，以后再来探索吧！

img/yuyu.png ADDED Viewed

img//345/217/202/346/225/260/350/257/264/346/230/216.png ADDED Viewed

img//345/256/265/345/256/253.png ADDED Viewed

img//345/276/256/344/277/241/345/233/276/347/211/207_20231010105112.png ADDED Viewed

img//347/245/236/351/207/214/347/273/253/345/215/216.png ADDED Viewed

img//347/272/263/350/245/277/345/246/262.png ADDED Viewed

infer.py CHANGED Viewed

@@ -5,17 +5,23 @@
     2. 请在模型的config.json中显式声明版本号，添加一个字段"version" : "你的版本号"
 特殊版本说明：
     1.1.1-fix： 1.1.1版本训练的模型，但是在推理时使用dev的日语修复
-    1.1.1-dev： dev开发
-    2.0：当前版本
 """
 import torch
 import commons
 from text import cleaned_text_to_sequence, get_bert
 from text.cleaner import clean_text
 import utils
 from models import SynthesizerTrn
 from text.symbols import symbols
 from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
 from oldVersion.V111.text import symbols as V111symbols
 from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
@@ -23,13 +29,17 @@ from oldVersion.V110.text import symbols as V110symbols
 from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
 from oldVersion.V101.text import symbols as V101symbols
-from oldVersion import V111, V110, V101
 # 当前版本信息
-latest_version = "2.0"
 # 版本兼容
 SynthesizerTrnMap = {
     "1.1.1-fix": V111SynthesizerTrn,
     "1.1.1": V111SynthesizerTrn,
     "1.1": V110SynthesizerTrn,
@@ -40,6 +50,10 @@ SynthesizerTrnMap = {
 }
 symbolsMap = {
     "1.1.1-fix": V111symbols,
     "1.1.1": V111symbols,
     "1.1": V110symbols,
@@ -50,6 +64,17 @@ symbolsMap = {
 }
 def get_net_g(model_path: str, version: str, device: str, hps):
     if version != latest_version:
         net_g = SynthesizerTrnMap[version](
@@ -91,15 +116,15 @@ def get_text(text, language_str, hps, device):
     if language_str == "ZH":
         bert = bert_ori
-        ja_bert = torch.zeros(1024, len(phone))
-        en_bert = torch.zeros(1024, len(phone))
     elif language_str == "JP":
-        bert = torch.zeros(1024, len(phone))
         ja_bert = bert_ori
-        en_bert = torch.zeros(1024, len(phone))
     elif language_str == "EN":
-        bert = torch.zeros(1024, len(phone))
-        ja_bert = torch.zeros(1024, len(phone))
         en_bert = bert_ori
     else:
         raise ValueError("language_str should be ZH, JP or EN")
@@ -116,6 +141,7 @@ def get_text(text, language_str, hps, device):
 def infer(
     text,
     sdp_ratio,
     noise_scale,
     noise_scale_w,
@@ -125,9 +151,20 @@ def infer(
     hps,
     net_g,
     device,
 ):
-    # 支持中日双语版本
     inferMap_V2 = {
         "1.1.1-fix": V111.infer_fix,
         "1.1.1": V111.infer,
         "1.1": V110.infer,
@@ -143,6 +180,23 @@ def infer(
     version = hps.version if hasattr(hps, "version") else latest_version
     # 非当前版本，根据版本号选择合适的infer
     if version != latest_version:
         if version in inferMap_V2.keys():
             return inferMap_V2[version](
                 text,
@@ -169,9 +223,127 @@ def infer(
                 device,
             )
     # 在此处实现当前版本的推理
     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
         text, language, hps, device
     )
     with torch.no_grad():
         x_tst = phones.to(device).unsqueeze(0)
         tones = tones.to(device).unsqueeze(0)
@@ -179,6 +351,7 @@ def infer(
         bert = bert.to(device).unsqueeze(0)
         ja_bert = ja_bert.to(device).unsqueeze(0)
         en_bert = en_bert.to(device).unsqueeze(0)
         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
         del phones
         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
@@ -192,6 +365,7 @@ def infer(
                 bert,
                 ja_bert,
                 en_bert,
                 sdp_ratio=sdp_ratio,
                 noise_scale=noise_scale,
                 noise_scale_w=noise_scale_w,
@@ -201,7 +375,7 @@ def infer(
             .float()
             .numpy()
         )
-        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         return audio

     2. 请在模型的config.json中显式声明版本号，添加一个字段"version" : "你的版本号"
 特殊版本说明：
     1.1.1-fix： 1.1.1版本训练的模型，但是在推理时使用dev的日语修复
+    2.2：当前版本
 """
 import torch
 import commons
 from text import cleaned_text_to_sequence, get_bert
+from clap_wrapper import get_clap_audio_feature, get_clap_text_feature
 from text.cleaner import clean_text
 import utils
+import numpy as np
 from models import SynthesizerTrn
 from text.symbols import symbols
+from oldVersion.V210.models import SynthesizerTrn as V210SynthesizerTrn
+from oldVersion.V210.text import symbols as V210symbols
+from oldVersion.V200.models import SynthesizerTrn as V200SynthesizerTrn
+from oldVersion.V200.text import symbols as V200symbols
 from oldVersion.V111.models import SynthesizerTrn as V111SynthesizerTrn
 from oldVersion.V111.text import symbols as V111symbols
 from oldVersion.V110.models import SynthesizerTrn as V110SynthesizerTrn
 from oldVersion.V101.models import SynthesizerTrn as V101SynthesizerTrn
 from oldVersion.V101.text import symbols as V101symbols
+from oldVersion import V111, V110, V101, V200, V210
 # 当前版本信息
+latest_version = "2.2"
 # 版本兼容
 SynthesizerTrnMap = {
+    "2.1": V210SynthesizerTrn,
+    "2.0.2-fix": V200SynthesizerTrn,
+    "2.0.1": V200SynthesizerTrn,
+    "2.0": V200SynthesizerTrn,
     "1.1.1-fix": V111SynthesizerTrn,
     "1.1.1": V111SynthesizerTrn,
     "1.1": V110SynthesizerTrn,
 }
 symbolsMap = {
+    "2.1": V210symbols,
+    "2.0.2-fix": V200symbols,
+    "2.0.1": V200symbols,
+    "2.0": V200symbols,
     "1.1.1-fix": V111symbols,
     "1.1.1": V111symbols,
     "1.1": V110symbols,
 }
+# def get_emo_(reference_audio, emotion, sid):
+#     emo = (
+#         torch.from_numpy(get_emo(reference_audio))
+#         if reference_audio and emotion == -1
+#         else torch.FloatTensor(
+#             np.load(f"emo_clustering/{sid}/cluster_center_{emotion}.npy")
+#         )
+#     )
+#     return emo
 def get_net_g(model_path: str, version: str, device: str, hps):
     if version != latest_version:
         net_g = SynthesizerTrnMap[version](
     if language_str == "ZH":
         bert = bert_ori
+        ja_bert = torch.rand(1024, len(phone))
+        en_bert = torch.rand(1024, len(phone))
     elif language_str == "JP":
+        bert = torch.rand(1024, len(phone))
         ja_bert = bert_ori
+        en_bert = torch.rand(1024, len(phone))
     elif language_str == "EN":
+        bert = torch.rand(1024, len(phone))
+        ja_bert = torch.rand(1024, len(phone))
         en_bert = bert_ori
     else:
         raise ValueError("language_str should be ZH, JP or EN")
 def infer(
     text,
+    emotion,
     sdp_ratio,
     noise_scale,
     noise_scale_w,
     hps,
     net_g,
     device,
+    reference_audio=None,
+    skip_start=False,
+    skip_end=False,
 ):
+    # 2.2版本参数位置变了
+    # 2.1 参数新增 emotion reference_audio skip_start skip_end
+    inferMap_V3 = {
+        "2.1": V210.infer,
+    }
+    # 支持中日英三语版本
     inferMap_V2 = {
+        "2.0.2-fix": V200.infer,
+        "2.0.1": V200.infer,
+        "2.0": V200.infer,
         "1.1.1-fix": V111.infer_fix,
         "1.1.1": V111.infer,
         "1.1": V110.infer,
     version = hps.version if hasattr(hps, "version") else latest_version
     # 非当前版本，根据版本号选择合适的infer
     if version != latest_version:
+        if version in inferMap_V3.keys():
+            return inferMap_V3[version](
+                text,
+                sdp_ratio,
+                noise_scale,
+                noise_scale_w,
+                length_scale,
+                sid,
+                language,
+                hps,
+                net_g,
+                device,
+                reference_audio,
+                emotion,
+                skip_start,
+                skip_end,
+            )
         if version in inferMap_V2.keys():
             return inferMap_V2[version](
                 text,
                 device,
             )
     # 在此处实现当前版本的推理
+    # emo = get_emo_(reference_audio, emotion, sid)
+    if isinstance(reference_audio, np.ndarray):
+        emo = get_clap_audio_feature(reference_audio, device)
+    else:
+        emo = get_clap_text_feature(emotion, device)
+    emo = torch.squeeze(emo, dim=1)
     bert, ja_bert, en_bert, phones, tones, lang_ids = get_text(
         text, language, hps, device
     )
+    if skip_start:
+        phones = phones[3:]
+        tones = tones[3:]
+        lang_ids = lang_ids[3:]
+        bert = bert[:, 3:]
+        ja_bert = ja_bert[:, 3:]
+        en_bert = en_bert[:, 3:]
+    if skip_end:
+        phones = phones[:-2]
+        tones = tones[:-2]
+        lang_ids = lang_ids[:-2]
+        bert = bert[:, :-2]
+        ja_bert = ja_bert[:, :-2]
+        en_bert = en_bert[:, :-2]
+    with torch.no_grad():
+        x_tst = phones.to(device).unsqueeze(0)
+        tones = tones.to(device).unsqueeze(0)
+        lang_ids = lang_ids.to(device).unsqueeze(0)
+        bert = bert.to(device).unsqueeze(0)
+        ja_bert = ja_bert.to(device).unsqueeze(0)
+        en_bert = en_bert.to(device).unsqueeze(0)
+        x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
+        emo = emo.to(device).unsqueeze(0)
+        del phones
+        speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
+        audio = (
+            net_g.infer(
+                x_tst,
+                x_tst_lengths,
+                speakers,
+                tones,
+                lang_ids,
+                bert,
+                ja_bert,
+                en_bert,
+                emo,
+                sdp_ratio=sdp_ratio,
+                noise_scale=noise_scale,
+                noise_scale_w=noise_scale_w,
+                length_scale=length_scale,
+            )[0][0, 0]
+            .data.cpu()
+            .float()
+            .numpy()
+        )
+        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+        return audio
+def infer_multilang(
+    text,
+    sdp_ratio,
+    noise_scale,
+    noise_scale_w,
+    length_scale,
+    sid,
+    language,
+    hps,
+    net_g,
+    device,
+    reference_audio=None,
+    emotion=None,
+    skip_start=False,
+    skip_end=False,
+):
+    bert, ja_bert, en_bert, phones, tones, lang_ids = [], [], [], [], [], []
+    # emo = get_emo_(reference_audio, emotion, sid)
+    if isinstance(reference_audio, np.ndarray):
+        emo = get_clap_audio_feature(reference_audio, device)
+    else:
+        emo = get_clap_text_feature(emotion, device)
+    emo = torch.squeeze(emo, dim=1)
+    for idx, (txt, lang) in enumerate(zip(text, language)):
+        skip_start = (idx != 0) or (skip_start and idx == 0)
+        skip_end = (idx != len(text) - 1) or (skip_end and idx == len(text) - 1)
+        (
+            temp_bert,
+            temp_ja_bert,
+            temp_en_bert,
+            temp_phones,
+            temp_tones,
+            temp_lang_ids,
+        ) = get_text(txt, lang, hps, device)
+        if skip_start:
+            temp_bert = temp_bert[:, 3:]
+            temp_ja_bert = temp_ja_bert[:, 3:]
+            temp_en_bert = temp_en_bert[:, 3:]
+            temp_phones = temp_phones[3:]
+            temp_tones = temp_tones[3:]
+            temp_lang_ids = temp_lang_ids[3:]
+        if skip_end:
+            temp_bert = temp_bert[:, :-2]
+            temp_ja_bert = temp_ja_bert[:, :-2]
+            temp_en_bert = temp_en_bert[:, :-2]
+            temp_phones = temp_phones[:-2]
+            temp_tones = temp_tones[:-2]
+            temp_lang_ids = temp_lang_ids[:-2]
+        bert.append(temp_bert)
+        ja_bert.append(temp_ja_bert)
+        en_bert.append(temp_en_bert)
+        phones.append(temp_phones)
+        tones.append(temp_tones)
+        lang_ids.append(temp_lang_ids)
+    bert = torch.concatenate(bert, dim=1)
+    ja_bert = torch.concatenate(ja_bert, dim=1)
+    en_bert = torch.concatenate(en_bert, dim=1)
+    phones = torch.concatenate(phones, dim=0)
+    tones = torch.concatenate(tones, dim=0)
+    lang_ids = torch.concatenate(lang_ids, dim=0)
     with torch.no_grad():
         x_tst = phones.to(device).unsqueeze(0)
         tones = tones.to(device).unsqueeze(0)
         bert = bert.to(device).unsqueeze(0)
         ja_bert = ja_bert.to(device).unsqueeze(0)
         en_bert = en_bert.to(device).unsqueeze(0)
+        emo = emo.to(device).unsqueeze(0)
         x_tst_lengths = torch.LongTensor([phones.size(0)]).to(device)
         del phones
         speakers = torch.LongTensor([hps.data.spk2id[sid]]).to(device)
                 bert,
                 ja_bert,
                 en_bert,
+                emo,
                 sdp_ratio=sdp_ratio,
                 noise_scale=noise_scale,
                 noise_scale_w=noise_scale_w,
             .float()
             .numpy()
         )
+        del x_tst, tones, lang_ids, bert, x_tst_lengths, speakers, ja_bert, en_bert, emo
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
         return audio

models.py CHANGED Viewed

@@ -10,9 +10,12 @@ import monotonic_align
 from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
 from text import symbols, num_tones, num_languages
 class DurationDiscriminator(nn.Module):  # vits2
     def __init__(
@@ -309,6 +312,37 @@ class DurationPredictor(nn.Module):
         return x * x_mask
 class TextEncoder(nn.Module):
     def __init__(
         self,
@@ -320,6 +354,7 @@ class TextEncoder(nn.Module):
         n_layers,
         kernel_size,
         p_dropout,
         gin_channels=0,
     ):
         super().__init__()
@@ -341,6 +376,31 @@ class TextEncoder(nn.Module):
         self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.encoder = attentions.Encoder(
             hidden_channels,
@@ -354,11 +414,17 @@ class TextEncoder(nn.Module):
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
     def forward(
-        self, x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=None
     ):
         bert_emb = self.bert_proj(bert).transpose(1, 2)
         ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
         en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
         x = (
             self.emb(x)
             + self.tone_emb(tone)
@@ -366,6 +432,7 @@ class TextEncoder(nn.Module):
             + bert_emb
             + ja_bert_emb
             + en_bert_emb
         ) * math.sqrt(
             self.hidden_channels
         )  # [b, t, h]
@@ -378,7 +445,7 @@ class TextEncoder(nn.Module):
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
-        return x, m, logs, x_mask
 class ResidualCouplingBlock(nn.Module):
@@ -811,6 +878,7 @@ class SynthesizerTrn(nn.Module):
             n_layers,
             kernel_size,
             p_dropout,
             gin_channels=self.enc_gin_channels,
         )
         self.dec = Generator(
@@ -884,8 +952,8 @@ class SynthesizerTrn(nn.Module):
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
-        x, m_p, logs_p, x_mask = self.enc_p(
-            x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=g
         )
         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
         z_p = self.flow(z, y_mask, g=g)
@@ -951,6 +1019,8 @@ class SynthesizerTrn(nn.Module):
             y_mask,
             (z, z_p, m_p, logs_p, m_q, logs_q),
             (x, logw, logw_),
         )
     def infer(
@@ -963,6 +1033,7 @@ class SynthesizerTrn(nn.Module):
         bert,
         ja_bert,
         en_bert,
         noise_scale=0.667,
         length_scale=1,
         noise_scale_w=0.8,
@@ -976,8 +1047,8 @@ class SynthesizerTrn(nn.Module):
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
-        x, m_p, logs_p, x_mask = self.enc_p(
-            x, x_lengths, tone, language, bert, ja_bert, en_bert, sid, g=g
         )
         logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
             sdp_ratio

 from torch.nn import Conv1d, ConvTranspose1d, Conv2d
 from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
 from commons import init_weights, get_padding
 from text import symbols, num_tones, num_languages
+from vector_quantize_pytorch import VectorQuantize
 class DurationDiscriminator(nn.Module):  # vits2
     def __init__(
         return x * x_mask
+class Bottleneck(nn.Sequential):
+    """Sequential container of two bias-free linear layers.
+
+    Both layers are constructed as ``nn.Linear(in_dim, hidden_dim, bias=False)``.
+    """
+    # NOTE(review): nn.Sequential feeds the first layer's hidden_dim-wide output
+    # into the second layer, which is declared with in_dim inputs, so this looks
+    # usable only when in_dim == hidden_dim -- confirm intent before using it.
+    def __init__(self, in_dim, hidden_dim):
+        c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
+        c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
+        super().__init__(*[c_fc1, c_fc2])
+class Block(nn.Module):
+    def __init__(self, in_dim, hidden_dim) -> None:
+        super().__init__()
+        self.norm = nn.LayerNorm(in_dim)
+        self.mlp = MLP(in_dim, hidden_dim)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.mlp(self.norm(x))
+        return x
+class MLP(nn.Module):
+    def __init__(self, in_dim, hidden_dim):
+        super().__init__()
+        self.c_fc1 = nn.Linear(in_dim, hidden_dim, bias=False)
+        self.c_fc2 = nn.Linear(in_dim, hidden_dim, bias=False)
+        self.c_proj = nn.Linear(hidden_dim, in_dim, bias=False)
+    def forward(self, x: torch.Tensor):
+        x = F.silu(self.c_fc1(x)) * self.c_fc2(x)
+        x = self.c_proj(x)
+        return x
 class TextEncoder(nn.Module):
     def __init__(
         self,
         n_layers,
         kernel_size,
         p_dropout,
+        n_speakers,
         gin_channels=0,
     ):
         super().__init__()
         self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
         self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        # self.emo_proj = nn.Linear(512, hidden_channels)
+        self.in_feature_net = nn.Sequential(
+            # input is assumed to an already normalized embedding
+            nn.Linear(512, 1028, bias=False),
+            nn.GELU(),
+            nn.LayerNorm(1028),
+            *[Block(1028, 512) for _ in range(1)],
+            nn.Linear(1028, 512, bias=False),
+            # normalize before passing to VQ?
+            # nn.GELU(),
+            # nn.LayerNorm(512),
+        )
+        self.emo_vq = VectorQuantize(
+            dim=512,
+            codebook_size=64,
+            codebook_dim=32,
+            commitment_weight=0.1,
+            decay=0.85,
+            heads=32,
+            kmeans_iters=20,
+            separate_codebook_per_head=True,
+            stochastic_sample_codes=True,
+            threshold_ema_dead_code=2,
+        )
+        self.out_feature_net = nn.Linear(512, hidden_channels)
         self.encoder = attentions.Encoder(
             hidden_channels,
         self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
     def forward(
+        self, x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=None
     ):
+        sid = sid.cpu()
         bert_emb = self.bert_proj(bert).transpose(1, 2)
         ja_bert_emb = self.ja_bert_proj(ja_bert).transpose(1, 2)
         en_bert_emb = self.en_bert_proj(en_bert).transpose(1, 2)
+        emo_emb = self.in_feature_net(emo)
+        emo_emb, _, loss_commit = self.emo_vq(emo_emb.unsqueeze(1))
+        loss_commit = loss_commit.mean()
+        emo_emb = self.out_feature_net(emo_emb)
+        # emo_emb = self.emo_proj(emo.unsqueeze(1))
         x = (
             self.emb(x)
             + self.tone_emb(tone)
             + bert_emb
             + ja_bert_emb
             + en_bert_emb
+            + emo_emb
         ) * math.sqrt(
             self.hidden_channels
         )  # [b, t, h]
         stats = self.proj(x) * x_mask
         m, logs = torch.split(stats, self.out_channels, dim=1)
+        return x, m, logs, x_mask, loss_commit
 class ResidualCouplingBlock(nn.Module):
             n_layers,
             kernel_size,
             p_dropout,
+            self.n_speakers,
             gin_channels=self.enc_gin_channels,
         )
         self.dec = Generator(
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
+        x, m_p, logs_p, x_mask, loss_commit = self.enc_p(
+            x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g
         )
         z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g)
         z_p = self.flow(z, y_mask, g=g)
             y_mask,
             (z, z_p, m_p, logs_p, m_q, logs_q),
             (x, logw, logw_),
+            g,
+            loss_commit,
         )
     def infer(
         bert,
         ja_bert,
         en_bert,
+        emo=None,
         noise_scale=0.667,
         length_scale=1,
         noise_scale_w=0.8,
             g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
         else:
             g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
+        x, m_p, logs_p, x_mask, _ = self.enc_p(
+            x, x_lengths, tone, language, bert, ja_bert, en_bert, emo, sid, g=g
         )
         logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * (
             sdp_ratio

monotonic_align/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/monotonic_align/__pycache__/__init__.cpython-311.pyc and b/monotonic_align/__pycache__/__init__.cpython-311.pyc differ

monotonic_align/__pycache__/core.cpython-311.pyc CHANGED Viewed

Binary files a/monotonic_align/__pycache__/core.cpython-311.pyc and b/monotonic_align/__pycache__/core.cpython-311.pyc differ

onnx_modules/V200/__init__.py ADDED Viewed

File without changes

onnx_modules/V200/attentions_onnx.py ADDED Viewed

	@@ -0,0 +1,378 @@

+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+import commons
+import logging
+logger = logging.getLogger(__name__)
+class LayerNorm(nn.Module):
+    def __init__(self, channels, eps=1e-5):
+        super().__init__()
+        self.channels = channels
+        self.eps = eps
+        self.gamma = nn.Parameter(torch.ones(channels))
+        self.beta = nn.Parameter(torch.zeros(channels))
+    def forward(self, x):
+        x = x.transpose(1, -1)
+        x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
+        return x.transpose(1, -1)
+@torch.jit.script
+def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
+    """Gated activation over the sum of two inputs.
+
+    The inputs are added, then split along dim 1 at ``n_channels[0]``: tanh is
+    applied to the first part, sigmoid to the remainder, and the two results
+    are multiplied element-wise.
+    """
+    # n_channels is indexed rather than used directly, so it must be a
+    # container/tensor holding the split point in its first element.
+    n_channels_int = n_channels[0]
+    in_act = input_a + input_b
+    t_act = torch.tanh(in_act[:, :n_channels_int, :])
+    s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
+    acts = t_act * s_act
+    return acts
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size=1,
+        p_dropout=0.0,
+        window_size=4,
+        isflow=True,
+        **kwargs
+    ):
+        super().__init__()
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        # if isflow:
+        #  cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
+        #  self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
+        #  self.cond_layer = weight_norm(cond_layer, name='weight')
+        #  self.gin_channels = 256
+        self.cond_layer_idx = self.n_layers
+        if "gin_channels" in kwargs:
+            self.gin_channels = kwargs["gin_channels"]
+            if self.gin_channels != 0:
+                self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
+                # vits2 says 3rd block, so idx is 2 by default
+                self.cond_layer_idx = (
+                    kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
+                )
+                logging.debug(self.gin_channels, self.cond_layer_idx)
+                assert (
+                    self.cond_layer_idx < self.n_layers
+                ), "cond_layer_idx should be less than n_layers"
+        self.drop = nn.Dropout(p_dropout)
+        self.attn_layers = nn.ModuleList()
+        self.norm_layers_1 = nn.ModuleList()
+        self.ffn_layers = nn.ModuleList()
+        self.norm_layers_2 = nn.ModuleList()
+        for i in range(self.n_layers):
+            self.attn_layers.append(
+                MultiHeadAttention(
+                    hidden_channels,
+                    hidden_channels,
+                    n_heads,
+                    p_dropout=p_dropout,
+                    window_size=window_size,
+                )
+            )
+            self.norm_layers_1.append(LayerNorm(hidden_channels))
+            self.ffn_layers.append(
+                FFN(
+                    hidden_channels,
+                    hidden_channels,
+                    filter_channels,
+                    kernel_size,
+                    p_dropout=p_dropout,
+                )
+            )
+            self.norm_layers_2.append(LayerNorm(hidden_channels))
+    def forward(self, x, x_mask, g=None):
+        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
+        x = x * x_mask
+        for i in range(self.n_layers):
+            if i == self.cond_layer_idx and g is not None:
+                g = self.spk_emb_linear(g.transpose(1, 2))
+                g = g.transpose(1, 2)
+                x = x + g
+                x = x * x_mask
+            y = self.attn_layers[i](x, x, attn_mask)
+            y = self.drop(y)
+            x = self.norm_layers_1[i](x + y)
+            y = self.ffn_layers[i](x, x_mask)
+            y = self.drop(y)
+            x = self.norm_layers_2[i](x + y)
+        x = x * x_mask
+        return x
+class MultiHeadAttention(nn.Module):
+    """Multi-head attention built from 1x1 Conv1d projections.
+
+    Optional features, each enabled by a constructor argument:
+      * ``window_size``: learned relative-position embeddings for keys and
+        values (self-attention only).
+      * ``proximal_bias``: additive bias favouring nearby positions.
+      * ``block_length``: band mask limiting attention to a local window.
+      * ``proximal_init``: initialise the key projection from the query one.
+
+    The attention weights of the most recent call are kept in ``self.attn``.
+    """
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        n_heads,
+        p_dropout=0.0,
+        window_size=None,
+        heads_share=True,
+        block_length=None,
+        proximal_bias=False,
+        proximal_init=False,
+    ):
+        super().__init__()
+        assert channels % n_heads == 0
+        self.channels = channels
+        self.out_channels = out_channels
+        self.n_heads = n_heads
+        self.p_dropout = p_dropout
+        self.window_size = window_size
+        self.heads_share = heads_share
+        self.block_length = block_length
+        self.proximal_bias = proximal_bias
+        self.proximal_init = proximal_init
+        self.attn = None
+        # Per-head channel width.
+        self.k_channels = channels // n_heads
+        self.conv_q = nn.Conv1d(channels, channels, 1)
+        self.conv_k = nn.Conv1d(channels, channels, 1)
+        self.conv_v = nn.Conv1d(channels, channels, 1)
+        self.conv_o = nn.Conv1d(channels, out_channels, 1)
+        self.drop = nn.Dropout(p_dropout)
+        if window_size is not None:
+            # One embedding table shared by all heads, or one per head.
+            n_heads_rel = 1 if heads_share else n_heads
+            rel_stddev = self.k_channels**-0.5
+            # Tables cover relative offsets -window_size .. +window_size.
+            self.emb_rel_k = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+            self.emb_rel_v = nn.Parameter(
+                torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
+                * rel_stddev
+            )
+        nn.init.xavier_uniform_(self.conv_q.weight)
+        nn.init.xavier_uniform_(self.conv_k.weight)
+        nn.init.xavier_uniform_(self.conv_v.weight)
+        if proximal_init:
+            with torch.no_grad():
+                self.conv_k.weight.copy_(self.conv_q.weight)
+                self.conv_k.bias.copy_(self.conv_q.bias)
+    def forward(self, x, c, attn_mask=None):
+        """Attend from ``x`` (queries) to ``c`` (keys/values) and project out."""
+        q = self.conv_q(x)
+        k = self.conv_k(c)
+        v = self.conv_v(c)
+        x, self.attn = self.attention(q, k, v, mask=attn_mask)
+        x = self.conv_o(x)
+        return x
+    def attention(self, query, key, value, mask=None):
+        """Scaled dot-product attention; returns ``(output, attention_weights)``."""
+        # reshape [b, d, t] -> [b, n_h, t, d_k]
+        b, d, t_s, t_t = (*key.size(), query.size(2))
+        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
+        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
+        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
+        if self.window_size is not None:
+            assert (
+                t_s == t_t
+            ), "Relative attention is only available for self-attention."
+            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
+            rel_logits = self._matmul_with_relative_keys(
+                query / math.sqrt(self.k_channels), key_relative_embeddings
+            )
+            scores_local = self._relative_position_to_absolute_position(rel_logits)
+            scores = scores + scores_local
+        if self.proximal_bias:
+            assert t_s == t_t, "Proximal bias is only available for self-attention."
+            scores = scores + self._attention_bias_proximal(t_s).to(
+                device=scores.device, dtype=scores.dtype
+            )
+        if mask is not None:
+            # A large negative logit drives masked weights to ~0 after softmax.
+            scores = scores.masked_fill(mask == 0, -1e4)
+            # Note: the block-local mask is nested here, so it is only applied
+            # when a mask is passed in.
+            if self.block_length is not None:
+                assert (
+                    t_s == t_t
+                ), "Local attention is only available for self-attention."
+                # Band of ones within block_length of the diagonal.
+                block_mask = (
+                    torch.ones_like(scores)
+                    .triu(-self.block_length)
+                    .tril(self.block_length)
+                )
+                scores = scores.masked_fill(block_mask == 0, -1e4)
+        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
+        p_attn = self.drop(p_attn)
+        output = torch.matmul(p_attn, value)
+        if self.window_size is not None:
+            relative_weights = self._absolute_position_to_relative_position(p_attn)
+            value_relative_embeddings = self._get_relative_embeddings(
+                self.emb_rel_v, t_s
+            )
+            output = output + self._matmul_with_relative_values(
+                relative_weights, value_relative_embeddings
+            )
+        output = (
+            output.transpose(2, 3).contiguous().view(b, d, t_t)
+        )  # [b, n_h, t_t, d_k] -> [b, d, t_t]
+        return output, p_attn
+    def _matmul_with_relative_values(self, x, y):
+        """
+        x: [b, h, l, m]
+        y: [h or 1, m, d]
+        ret: [b, h, l, d]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0))
+        return ret
+    def _matmul_with_relative_keys(self, x, y):
+        """
+        x: [b, h, l, d]
+        y: [h or 1, m, d]
+        ret: [b, h, l, m]
+        """
+        ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
+        return ret
+    def _get_relative_embeddings(self, relative_embeddings, length):
+        """Pad and slice the embedding table to ``2 * length - 1`` offsets."""
+        # NOTE(review): max_relative_position is computed but never used below.
+        max_relative_position = 2 * self.window_size + 1
+        # Pad first before slice to avoid using cond ops.
+        pad_length = max(length - (self.window_size + 1), 0)
+        slice_start_position = max((self.window_size + 1) - length, 0)
+        slice_end_position = slice_start_position + 2 * length - 1
+        if pad_length > 0:
+            padded_relative_embeddings = F.pad(
+                relative_embeddings,
+                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
+            )
+        else:
+            padded_relative_embeddings = relative_embeddings
+        used_relative_embeddings = padded_relative_embeddings[
+            :, slice_start_position:slice_end_position
+        ]
+        return used_relative_embeddings
+    def _relative_position_to_absolute_position(self, x):
+        """
+        x: [b, h, l, 2*l-1]
+        ret: [b, h, l, l]
+        """
+        batch, heads, length, _ = x.size()
+        # Concat columns of pad to shift from relative to absolute indexing.
+        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
+        # Concat extra elements so to add up to shape (len+1, 2*len-1).
+        x_flat = x.view([batch, heads, length * 2 * length])
+        x_flat = F.pad(
+            x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
+        )
+        # Reshape and slice out the padded elements.
+        x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
+            :, :, :length, length - 1 :
+        ]
+        return x_final
+    def _absolute_position_to_relative_position(self, x):
+        """
+        x: [b, h, l, l]
+        ret: [b, h, l, 2*l-1]
+        """
+        batch, heads, length, _ = x.size()
+        # Pad along the column axis.
+        x = F.pad(
+            x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
+        )
+        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
+        # add 0's in the beginning that will skew the elements after reshape
+        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
+        x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
+        return x_final
+    def _attention_bias_proximal(self, length):
+        """Bias for self-attention to encourage attention to close positions.
+        Args:
+          length: an integer scalar.
+        Returns:
+          a Tensor with shape [1, 1, length, length]
+        """
+        r = torch.arange(length, dtype=torch.float32)
+        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
+        # -log(1 + |i - j|): zero on the diagonal, more negative further away.
+        return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
+class FFN(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        filter_channels,
+        kernel_size,
+        p_dropout=0.0,
+        activation=None,
+        causal=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.activation = activation
+        self.causal = causal
+        if causal:
+            self.padding = self._causal_padding
+        else:
+            self.padding = self._same_padding
+        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
+        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
+        self.drop = nn.Dropout(p_dropout)
+    def forward(self, x, x_mask):
+        x = self.conv_1(self.padding(x * x_mask))
+        if self.activation == "gelu":
+            x = x * torch.sigmoid(1.702 * x)
+        else:
+            x = torch.relu(x)
+        x = self.drop(x)
+        x = self.conv_2(self.padding(x * x_mask))
+        return x * x_mask
+    def _causal_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = self.kernel_size - 1
+        pad_r = 0
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = F.pad(x, commons.convert_pad_shape(padding))
+        return x
+    def _same_padding(self, x):
+        if self.kernel_size == 1:
+            return x
+        pad_l = (self.kernel_size - 1) // 2
+        pad_r = self.kernel_size // 2
+        padding = [[0, 0], [0, 0], [pad_l, pad_r]]
+        x = F.pad(x, commons.convert_pad_shape(padding))
+        return x

onnx_modules/V200/models_onnx.py ADDED Viewed

	@@ -0,0 +1,990 @@

+import math
+import torch
+from torch import nn
+from torch.nn import functional as F
+import commons
+import modules
+from . import attentions_onnx
+from torch.nn import Conv1d, ConvTranspose1d, Conv2d
+from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
+from commons import init_weights, get_padding
+from .text import symbols, num_tones, num_languages
+class DurationDiscriminator(nn.Module):  # vits2
+    """Duration discriminator (VITS2).
+
+    Encodes detached text features with two conv/ReLU/LayerNorm/dropout
+    stages, then, for each duration tensor, concatenates a projection of the
+    duration and produces a sigmoid output per position.
+    """
+    def __init__(
+        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_1 = nn.Conv1d(
+            in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_1 = modules.LayerNorm(filter_channels)
+        self.conv_2 = nn.Conv1d(
+            filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_2 = modules.LayerNorm(filter_channels)
+        # Lifts the single-channel duration to filter_channels.
+        self.dur_proj = nn.Conv1d(1, filter_channels, 1)
+        # Input is the concatenation of features and projected duration.
+        self.pre_out_conv_1 = nn.Conv1d(
+            2 * filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.pre_out_norm_1 = modules.LayerNorm(filter_channels)
+        self.pre_out_conv_2 = nn.Conv1d(
+            filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.pre_out_norm_2 = modules.LayerNorm(filter_channels)
+        # The conditioning projection only exists when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+        self.output_layer = nn.Sequential(nn.Linear(filter_channels, 1), nn.Sigmoid())
+    def forward_probability(self, x, x_mask, dur, g=None):
+        """Return the sigmoid output for one duration tensor ``dur``.
+
+        ``g`` is accepted for signature symmetry but is not used here.
+        """
+        dur = self.dur_proj(dur)
+        x = torch.cat([x, dur], dim=1)
+        x = self.pre_out_conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.pre_out_norm_1(x)
+        x = self.drop(x)
+        x = self.pre_out_conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.pre_out_norm_2(x)
+        x = self.drop(x)
+        x = x * x_mask
+        # Linear acts on the last dim, so move channels there.
+        x = x.transpose(1, 2)
+        output_prob = self.output_layer(x)
+        return output_prob
+    def forward(self, x, x_mask, dur_r, dur_hat, g=None):
+        """Return ``[prob(dur_r), prob(dur_hat)]`` in that order.
+
+        ``x`` and ``g`` are detached, so no gradient flows back through them.
+        """
+        x = torch.detach(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_1(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_2(x)
+        x = self.drop(x)
+        output_probs = []
+        for dur in [dur_r, dur_hat]:
+            output_prob = self.forward_probability(x, x_mask, dur, g)
+            output_probs.append(output_prob)
+        return output_probs
+class TransformerCouplingBlock(nn.Module):
+    """Flow made of ``n_flows`` transformer coupling layers, each followed by a Flip.
+
+    With ``share_parameter=True`` a single transformer is built and handed to
+    every coupling layer via ``wn_sharing_parameter``; otherwise ``None`` is
+    passed.
+    """
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        n_flows=4,
+        gin_channels=0,
+        share_parameter=False,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.flows = nn.ModuleList()
+        # NOTE(review): no FFT class appears in the attentions_onnx.py shown in
+        # this commit view; this expression is only evaluated when
+        # share_parameter is True -- confirm FFT exists before enabling it.
+        self.wn = (
+            attentions_onnx.FFT(
+                hidden_channels,
+                filter_channels,
+                n_heads,
+                n_layers,
+                kernel_size,
+                p_dropout,
+                isflow=True,
+                gin_channels=self.gin_channels,
+            )
+            if share_parameter
+            else None
+        )
+        for i in range(n_flows):
+            self.flows.append(
+                modules.TransformerCouplingLayer(
+                    channels,
+                    hidden_channels,
+                    kernel_size,
+                    n_layers,
+                    n_heads,
+                    p_dropout,
+                    filter_channels,
+                    mean_only=True,
+                    wn_sharing_parameter=self.wn,
+                    gin_channels=self.gin_channels,
+                )
+            )
+            self.flows.append(modules.Flip())
+    def forward(self, x, x_mask, g=None, reverse=True):
+        """Apply the flows forwards, or (by default) in reverse order.
+
+        In the forward direction each flow returns a pair whose second element
+        is discarded; in reverse each flow returns only the tensor.
+        """
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+class StochasticDurationPredictor(nn.Module):
+    """Flow-based duration predictor; this variant only runs the flows in reverse.
+
+    The ``post_*`` modules are constructed but are not used by ``forward`` in
+    this class.
+    """
+    def __init__(
+        self,
+        in_channels,
+        filter_channels,
+        kernel_size,
+        p_dropout,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        # The filter_channels argument is overridden by in_channels here.
+        filter_channels = in_channels  # it needs to be removed from future version.
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.log_flow = modules.Log()
+        self.flows = nn.ModuleList()
+        self.flows.append(modules.ElementwiseAffine(2))
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+            )
+            self.flows.append(modules.Flip())
+        self.post_pre = nn.Conv1d(1, filter_channels, 1)
+        self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.post_convs = modules.DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        self.post_flows = nn.ModuleList()
+        self.post_flows.append(modules.ElementwiseAffine(2))
+        # Unlike self.flows, the posterior stack always has 4 ConvFlow layers.
+        for i in range(4):
+            self.post_flows.append(
+                modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3)
+            )
+            self.post_flows.append(modules.Flip())
+        self.pre = nn.Conv1d(in_channels, filter_channels, 1)
+        self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
+        self.convs = modules.DDSConv(
+            filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout
+        )
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
+    def forward(self, x, x_mask, z, g=None):
+        """Map ``z`` through the reversed flows and return the first channel as ``logw``.
+
+        ``x`` and ``g`` are detached; the conditioned, convolved ``x`` is passed
+        to each flow as its ``g`` argument.
+        """
+        x = torch.detach(x)
+        x = self.pre(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.convs(x, x_mask)
+        x = self.proj(x) * x_mask
+        flows = list(reversed(self.flows))
+        # Drops the second-to-last entry of the reversed list and keeps the last.
+        flows = flows[:-2] + [flows[-1]]  # remove a useless vflow
+        for flow in flows:
+            z = flow(z, x_mask, g=x, reverse=True)
+        # z is split into two single-channel halves; only the first is used.
+        z0, z1 = torch.split(z, [1, 1], 1)
+        logw = z0
+        return logw
+class DurationPredictor(nn.Module):
+    """Deterministic duration predictor.
+
+    Two conv/ReLU/LayerNorm/dropout stages followed by a 1x1 projection to a
+    single channel. The input (and optional conditioning ``g``) is detached.
+    """
+    def __init__(
+        self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.filter_channels = filter_channels
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        self.drop = nn.Dropout(p_dropout)
+        self.conv_1 = nn.Conv1d(
+            in_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_1 = modules.LayerNorm(filter_channels)
+        self.conv_2 = nn.Conv1d(
+            filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
+        )
+        self.norm_2 = modules.LayerNorm(filter_channels)
+        self.proj = nn.Conv1d(filter_channels, 1, 1)
+        # The conditioning projection only exists when gin_channels is non-zero.
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, in_channels, 1)
+    def forward(self, x, x_mask, g=None):
+        """Return the masked single-channel prediction for ``x``."""
+        # Detach so no gradient flows back into the producer of x.
+        x = torch.detach(x)
+        if g is not None:
+            g = torch.detach(g)
+            x = x + self.cond(g)
+        x = self.conv_1(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_1(x)
+        x = self.drop(x)
+        x = self.conv_2(x * x_mask)
+        x = torch.relu(x)
+        x = self.norm_2(x)
+        x = self.drop(x)
+        x = self.proj(x * x_mask)
+        return x * x_mask
+class TextEncoder(nn.Module):
+    def __init__(
+        self,
+        n_vocab,
+        out_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.gin_channels = gin_channels
+        self.emb = nn.Embedding(len(symbols), hidden_channels)
+        nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
+        self.tone_emb = nn.Embedding(num_tones, hidden_channels)
+        nn.init.normal_(self.tone_emb.weight, 0.0, hidden_channels**-0.5)
+        self.language_emb = nn.Embedding(num_languages, hidden_channels)
+        nn.init.normal_(self.language_emb.weight, 0.0, hidden_channels**-0.5)
+        self.bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        self.ja_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        self.en_bert_proj = nn.Conv1d(1024, hidden_channels, 1)
+        self.encoder = attentions_onnx.Encoder(
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            gin_channels=self.gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, tone, language, bert, ja_bert, en_bert, g=None):
+        x_mask = torch.ones_like(x).unsqueeze(0)
+        bert_emb = self.bert_proj(bert.transpose(0, 1).unsqueeze(0)).transpose(1, 2)
+        ja_bert_emb = self.ja_bert_proj(ja_bert.transpose(0, 1).unsqueeze(0)).transpose(
+            1, 2
+        )
+        en_bert_emb = self.en_bert_proj(en_bert.transpose(0, 1).unsqueeze(0)).transpose(
+            1, 2
+        )
+        x = (
+            self.emb(x)
+            + self.tone_emb(tone)
+            + self.language_emb(language)
+            + bert_emb
+            + ja_bert_emb
+            + en_bert_emb
+        ) * math.sqrt(
+            self.hidden_channels
+        )  # [b, t, h]
+        x = torch.transpose(x, 1, -1)  # [b, h, t]
+        x_mask = x_mask.to(x.dtype)
+        x = self.encoder(x * x_mask, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        return x, m, logs, x_mask
+class ResidualCouplingBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        n_flows=4,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.n_flows = n_flows
+        self.gin_channels = gin_channels
+        self.flows = nn.ModuleList()
+        for i in range(n_flows):
+            self.flows.append(
+                modules.ResidualCouplingLayer(
+                    channels,
+                    hidden_channels,
+                    kernel_size,
+                    dilation_rate,
+                    n_layers,
+                    gin_channels=gin_channels,
+                    mean_only=True,
+                )
+            )
+            self.flows.append(modules.Flip())
+    def forward(self, x, x_mask, g=None, reverse=True):
+        if not reverse:
+            for flow in self.flows:
+                x, _ = flow(x, x_mask, g=g, reverse=reverse)
+        else:
+            for flow in reversed(self.flows):
+                x = flow(x, x_mask, g=g, reverse=reverse)
+        return x
+class PosteriorEncoder(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        hidden_channels,
+        kernel_size,
+        dilation_rate,
+        n_layers,
+        gin_channels=0,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.hidden_channels = hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation_rate = dilation_rate
+        self.n_layers = n_layers
+        self.gin_channels = gin_channels
+        self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
+        self.enc = modules.WN(
+            hidden_channels,
+            kernel_size,
+            dilation_rate,
+            n_layers,
+            gin_channels=gin_channels,
+        )
+        self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
+    def forward(self, x, x_lengths, g=None):
+        x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
+            x.dtype
+        )
+        x = self.pre(x) * x_mask
+        x = self.enc(x, x_mask, g=g)
+        stats = self.proj(x) * x_mask
+        m, logs = torch.split(stats, self.out_channels, dim=1)
+        z = (m + torch.randn_like(m) * torch.exp(logs)) * x_mask
+        return z, m, logs, x_mask
+class Generator(torch.nn.Module):
+    def __init__(
+        self,
+        initial_channel,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        gin_channels=0,
+    ):
+        super(Generator, self).__init__()
+        self.num_kernels = len(resblock_kernel_sizes)
+        self.num_upsamples = len(upsample_rates)
+        self.conv_pre = Conv1d(
+            initial_channel, upsample_initial_channel, 7, 1, padding=3
+        )
+        resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
+        self.ups = nn.ModuleList()
+        for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
+            self.ups.append(
+                weight_norm(
+                    ConvTranspose1d(
+                        upsample_initial_channel // (2**i),
+                        upsample_initial_channel // (2 ** (i + 1)),
+                        k,
+                        u,
+                        padding=(k - u) // 2,
+                    )
+                )
+            )
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.ups)):
+            ch = upsample_initial_channel // (2 ** (i + 1))
+            for j, (k, d) in enumerate(
+                zip(resblock_kernel_sizes, resblock_dilation_sizes)
+            ):
+                self.resblocks.append(resblock(ch, k, d))
+        self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
+        self.ups.apply(init_weights)
+        if gin_channels != 0:
+            self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
+    def forward(self, x, g=None):
+        x = self.conv_pre(x)
+        if g is not None:
+            x = x + self.cond(g)
+        for i in range(self.num_upsamples):
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            x = self.ups[i](x)
+            xs = None
+            for j in range(self.num_kernels):
+                if xs is None:
+                    xs = self.resblocks[i * self.num_kernels + j](x)
+                else:
+                    xs += self.resblocks[i * self.num_kernels + j](x)
+            x = xs / self.num_kernels
+        x = F.leaky_relu(x)
+        x = self.conv_post(x)
+        x = torch.tanh(x)
+        return x
+    def remove_weight_norm(self):
+        print("Removing weight norm...")
+        for layer in self.ups:
+            remove_weight_norm(layer)
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+class DiscriminatorP(torch.nn.Module):
+    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
+        super(DiscriminatorP, self).__init__()
+        self.period = period
+        self.use_spectral_norm = use_spectral_norm
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(
+                    Conv2d(
+                        1,
+                        32,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        32,
+                        128,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        128,
+                        512,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        512,
+                        1024,
+                        (kernel_size, 1),
+                        (stride, 1),
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+                norm_f(
+                    Conv2d(
+                        1024,
+                        1024,
+                        (kernel_size, 1),
+                        1,
+                        padding=(get_padding(kernel_size, 1), 0),
+                    )
+                ),
+            ]
+        )
+        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))
+    def forward(self, x):
+        fmap = []
+        # 1d to 2d
+        b, c, t = x.shape
+        if t % self.period != 0:  # pad first
+            n_pad = self.period - (t % self.period)
+            x = F.pad(x, (0, n_pad), "reflect")
+            t = t + n_pad
+        x = x.view(b, c, t // self.period, self.period)
+        for layer in self.convs:
+            x = layer(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class DiscriminatorS(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(DiscriminatorS, self).__init__()
+        norm_f = weight_norm if use_spectral_norm is False else spectral_norm
+        self.convs = nn.ModuleList(
+            [
+                norm_f(Conv1d(1, 16, 15, 1, padding=7)),
+                norm_f(Conv1d(16, 64, 41, 4, groups=4, padding=20)),
+                norm_f(Conv1d(64, 256, 41, 4, groups=16, padding=20)),
+                norm_f(Conv1d(256, 1024, 41, 4, groups=64, padding=20)),
+                norm_f(Conv1d(1024, 1024, 41, 4, groups=256, padding=20)),
+                norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
+            ]
+        )
+        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))
+    def forward(self, x):
+        fmap = []
+        for layer in self.convs:
+            x = layer(x)
+            x = F.leaky_relu(x, modules.LRELU_SLOPE)
+            fmap.append(x)
+        x = self.conv_post(x)
+        fmap.append(x)
+        x = torch.flatten(x, 1, -1)
+        return x, fmap
+class MultiPeriodDiscriminator(torch.nn.Module):
+    def __init__(self, use_spectral_norm=False):
+        super(MultiPeriodDiscriminator, self).__init__()
+        periods = [2, 3, 5, 7, 11]
+        discs = [DiscriminatorS(use_spectral_norm=use_spectral_norm)]
+        discs = discs + [
+            DiscriminatorP(i, use_spectral_norm=use_spectral_norm) for i in periods
+        ]
+        self.discriminators = nn.ModuleList(discs)
+    def forward(self, y, y_hat):
+        y_d_rs = []
+        y_d_gs = []
+        fmap_rs = []
+        fmap_gs = []
+        for i, d in enumerate(self.discriminators):
+            y_d_r, fmap_r = d(y)
+            y_d_g, fmap_g = d(y_hat)
+            y_d_rs.append(y_d_r)
+            y_d_gs.append(y_d_g)
+            fmap_rs.append(fmap_r)
+            fmap_gs.append(fmap_g)
+        return y_d_rs, y_d_gs, fmap_rs, fmap_gs
+class ReferenceEncoder(nn.Module):
+    """
+    inputs --- [N, Ty/r, n_mels*r]  mels
+    outputs --- [N, ref_enc_gru_size]
+    """
+    def __init__(self, spec_channels, gin_channels=0):
+        super().__init__()
+        self.spec_channels = spec_channels
+        ref_enc_filters = [32, 32, 64, 64, 128, 128]
+        K = len(ref_enc_filters)
+        filters = [1] + ref_enc_filters
+        convs = [
+            weight_norm(
+                nn.Conv2d(
+                    in_channels=filters[i],
+                    out_channels=filters[i + 1],
+                    kernel_size=(3, 3),
+                    stride=(2, 2),
+                    padding=(1, 1),
+                )
+            )
+            for i in range(K)
+        ]
+        self.convs = nn.ModuleList(convs)
+        # self.wns = nn.ModuleList([weight_norm(num_features=ref_enc_filters[i]) for i in range(K)]) # noqa: E501
+        out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
+        self.gru = nn.GRU(
+            input_size=ref_enc_filters[-1] * out_channels,
+            hidden_size=256 // 2,
+            batch_first=True,
+        )
+        self.proj = nn.Linear(128, gin_channels)
+    def forward(self, inputs, mask=None):
+        N = inputs.size(0)
+        out = inputs.view(N, 1, -1, self.spec_channels)  # [N, 1, Ty, n_freqs]
+        for conv in self.convs:
+            out = conv(out)
+            # out = wn(out)
+            out = F.relu(out)  # [N, 128, Ty//2^K, n_mels//2^K]
+        out = out.transpose(1, 2)  # [N, Ty//2^K, 128, n_mels//2^K]
+        T = out.size(1)
+        N = out.size(0)
+        out = out.contiguous().view(N, T, -1)  # [N, Ty//2^K, 128*n_mels//2^K]
+        self.gru.flatten_parameters()
+        memory, out = self.gru(out)  # out --- [1, N, 128]
+        return self.proj(out.squeeze(0))
+    def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
+        for i in range(n_convs):
+            L = (L - kernel_size + 2 * pad) // stride + 1
+        return L
+class SynthesizerTrn(nn.Module):
+    """
+    Synthesizer for Training
+    """
+    def __init__(
+        self,
+        n_vocab,
+        spec_channels,
+        segment_size,
+        inter_channels,
+        hidden_channels,
+        filter_channels,
+        n_heads,
+        n_layers,
+        kernel_size,
+        p_dropout,
+        resblock,
+        resblock_kernel_sizes,
+        resblock_dilation_sizes,
+        upsample_rates,
+        upsample_initial_channel,
+        upsample_kernel_sizes,
+        n_speakers=256,
+        gin_channels=256,
+        use_sdp=True,
+        n_flow_layer=4,
+        n_layers_trans_flow=4,
+        flow_share_parameter=False,
+        use_transformer_flow=True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.n_vocab = n_vocab
+        self.spec_channels = spec_channels
+        self.inter_channels = inter_channels
+        self.hidden_channels = hidden_channels
+        self.filter_channels = filter_channels
+        self.n_heads = n_heads
+        self.n_layers = n_layers
+        self.kernel_size = kernel_size
+        self.p_dropout = p_dropout
+        self.resblock = resblock
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.upsample_rates = upsample_rates
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.segment_size = segment_size
+        self.n_speakers = n_speakers
+        self.gin_channels = gin_channels
+        self.n_layers_trans_flow = n_layers_trans_flow
+        self.use_spk_conditioned_encoder = kwargs.get(
+            "use_spk_conditioned_encoder", True
+        )
+        self.use_sdp = use_sdp
+        self.use_noise_scaled_mas = kwargs.get("use_noise_scaled_mas", False)
+        self.mas_noise_scale_initial = kwargs.get("mas_noise_scale_initial", 0.01)
+        self.noise_scale_delta = kwargs.get("noise_scale_delta", 2e-6)
+        self.current_mas_noise_scale = self.mas_noise_scale_initial
+        if self.use_spk_conditioned_encoder and gin_channels > 0:
+            self.enc_gin_channels = gin_channels
+        self.enc_p = TextEncoder(
+            n_vocab,
+            inter_channels,
+            hidden_channels,
+            filter_channels,
+            n_heads,
+            n_layers,
+            kernel_size,
+            p_dropout,
+            gin_channels=self.enc_gin_channels,
+        )
+        self.dec = Generator(
+            inter_channels,
+            resblock,
+            resblock_kernel_sizes,
+            resblock_dilation_sizes,
+            upsample_rates,
+            upsample_initial_channel,
+            upsample_kernel_sizes,
+            gin_channels=gin_channels,
+        )
+        self.enc_q = PosteriorEncoder(
+            spec_channels,
+            inter_channels,
+            hidden_channels,
+            5,
+            1,
+            16,
+            gin_channels=gin_channels,
+        )
+        if use_transformer_flow:
+            self.flow = TransformerCouplingBlock(
+                inter_channels,
+                hidden_channels,
+                filter_channels,
+                n_heads,
+                n_layers_trans_flow,
+                5,
+                p_dropout,
+                n_flow_layer,
+                gin_channels=gin_channels,
+                share_parameter=flow_share_parameter,
+            )
+        else:
+            self.flow = ResidualCouplingBlock(
+                inter_channels,
+                hidden_channels,
+                5,
+                1,
+                n_flow_layer,
+                gin_channels=gin_channels,
+            )
+        self.sdp = StochasticDurationPredictor(
+            hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels
+        )
+        self.dp = DurationPredictor(
+            hidden_channels, 256, 3, 0.5, gin_channels=gin_channels
+        )
+        if n_speakers >= 1:
+            self.emb_g = nn.Embedding(n_speakers, gin_channels)
+        else:
+            self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
+    def export_onnx(
+        self,
+        path,
+        max_len=None,
+        sdp_ratio=0,
+        y=None,
+    ):
+        noise_scale = 0.667
+        length_scale = 1
+        noise_scale_w = 0.8
+        x = (
+            torch.LongTensor(
+                [
+                    0,
+                    97,
+                    0,
+                    8,
+                    0,
+                    78,
+                    0,
+                    8,
+                    0,
+                    76,
+                    0,
+                    37,
+                    0,
+                    40,
+                    0,
+                    97,
+                    0,
+                    8,
+                    0,
+                    23,
+                    0,
+                    8,
+                    0,
+                    74,
+                    0,
+                    26,
+                    0,
+                    104,
+                    0,
+                ]
+            )
+            .unsqueeze(0)
+            .cpu()
+        )
+        tone = torch.zeros_like(x).cpu()
+        language = torch.zeros_like(x).cpu()
+        x_lengths = torch.LongTensor([x.shape[1]]).cpu()
+        sid = torch.LongTensor([0]).cpu()
+        bert = torch.randn(size=(x.shape[1], 1024)).cpu()
+        ja_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
+        en_bert = torch.randn(size=(x.shape[1], 1024)).cpu()
+        if self.n_speakers > 0:
+            g = self.emb_g(sid).unsqueeze(-1)  # [b, h, 1]
+            torch.onnx.export(
+                self.emb_g,
+                (sid),
+                f"onnx/{path}/{path}_emb.onnx",
+                input_names=["sid"],
+                output_names=["g"],
+                verbose=True,
+            )
+        else:
+            g = self.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
+        torch.onnx.export(
+            self.enc_p,
+            (x, x_lengths, tone, language, bert, ja_bert, en_bert, g),
+            f"onnx/{path}/{path}_enc_p.onnx",
+            input_names=[
+                "x",
+                "x_lengths",
+                "t",
+                "language",
+                "bert_0",
+                "bert_1",
+                "bert_2",
+                "g",
+            ],
+            output_names=["xout", "m_p", "logs_p", "x_mask"],
+            dynamic_axes={
+                "x": [0, 1],
+                "t": [0, 1],
+                "language": [0, 1],
+                "bert_0": [0],
+                "bert_1": [0],
+                "bert_2": [0],
+                "xout": [0, 2],
+                "m_p": [0, 2],
+                "logs_p": [0, 2],
+                "x_mask": [0, 2],
+            },
+            verbose=True,
+            opset_version=16,
+        )
+        x, m_p, logs_p, x_mask = self.enc_p(
+            x, x_lengths, tone, language, bert, ja_bert, en_bert, g=g
+        )
+        zinput = (
+            torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype)
+            * noise_scale_w
+        )
+        torch.onnx.export(
+            self.sdp,
+            (x, x_mask, zinput, g),
+            f"onnx/{path}/{path}_sdp.onnx",
+            input_names=["x", "x_mask", "zin", "g"],
+            output_names=["logw"],
+            dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "zin": [0, 2], "logw": [0, 2]},
+            verbose=True,
+        )
+        torch.onnx.export(
+            self.dp,
+            (x, x_mask, g),
+            f"onnx/{path}/{path}_dp.onnx",
+            input_names=["x", "x_mask", "g"],
+            output_names=["logw"],
+            dynamic_axes={"x": [0, 2], "x_mask": [0, 2], "logw": [0, 2]},
+            verbose=True,
+        )
+        logw = self.sdp(x, x_mask, zinput, g=g) * (sdp_ratio) + self.dp(
+            x, x_mask, g=g
+        ) * (1 - sdp_ratio)
+        w = torch.exp(logw) * x_mask * length_scale
+        w_ceil = torch.ceil(w)
+        y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
+        y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(
+            x_mask.dtype
+        )
+        attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
+        attn = commons.generate_path(w_ceil, attn_mask)
+        m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(
+            1, 2
+        )  # [b, t', t], [b, t, d] -> [b, d, t']
+        z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
+        torch.onnx.export(
+            self.flow,
+            (z_p, y_mask, g),
+            f"onnx/{path}/{path}_flow.onnx",
+            input_names=["z_p", "y_mask", "g"],
+            output_names=["z"],
+            dynamic_axes={"z_p": [0, 2], "y_mask": [0, 2], "z": [0, 2]},
+            verbose=True,
+        )
+        z = self.flow(z_p, y_mask, g=g, reverse=True)
+        z_in = (z * y_mask)[:, :, :max_len]
+        torch.onnx.export(
+            self.dec,
+            (z_in, g),
+            f"onnx/{path}/{path}_dec.onnx",
+            input_names=["z_in", "g"],
+            output_names=["o"],
+            dynamic_axes={"z_in": [0, 2], "o": [0, 2]},
+            verbose=True,
+        )
+        o = self.dec((z * y_mask)[:, :, :max_len], g=g)

onnx_modules/V200/text/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .symbols import *