Lycoris53 committed on
Commit
03204e8
0 Parent(s):

Duplicate from Lycoris53/VITS-TTS-Japanese-Only-Amitaro

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .gitignore +3 -0
  3. ConvertBitrate.ipynb +165 -0
  4. ConvertBitrate.py +77 -0
  5. LICENSE +201 -0
  6. OUTPUT_MODEL/D_Amitaro.pth +3 -0
  7. OUTPUT_MODEL/G_Amitaro.pth +3 -0
  8. OUTPUT_MODEL/config.json +56 -0
  9. ParseAmitaroHTML.py +68 -0
  10. README.md +13 -0
  11. Test_Inference.ipynb +129 -0
  12. VC_inference.py +146 -0
  13. amitaro_jp_base.json +56 -0
  14. app.py +66 -0
  15. attentions.py +303 -0
  16. cmd_inference.py +106 -0
  17. commons.py +164 -0
  18. configs/amitaro_jp_base.json +56 -0
  19. custom_character_voice/amitaro/amitaro_0.wav +0 -0
  20. custom_character_voice/amitaro/amitaro_1.wav +0 -0
  21. custom_character_voice/amitaro/amitaro_10.wav +0 -0
  22. custom_character_voice/amitaro/amitaro_11.wav +0 -0
  23. custom_character_voice/amitaro/amitaro_12.wav +0 -0
  24. custom_character_voice/amitaro/amitaro_13.wav +0 -0
  25. custom_character_voice/amitaro/amitaro_14.wav +0 -0
  26. custom_character_voice/amitaro/amitaro_15.wav +0 -0
  27. custom_character_voice/amitaro/amitaro_16.wav +0 -0
  28. custom_character_voice/amitaro/amitaro_17.wav +0 -0
  29. custom_character_voice/amitaro/amitaro_18.wav +0 -0
  30. custom_character_voice/amitaro/amitaro_19.wav +0 -0
  31. custom_character_voice/amitaro/amitaro_2.wav +0 -0
  32. custom_character_voice/amitaro/amitaro_20.wav +0 -0
  33. custom_character_voice/amitaro/amitaro_21.wav +0 -0
  34. custom_character_voice/amitaro/amitaro_22.wav +0 -0
  35. custom_character_voice/amitaro/amitaro_23.wav +0 -0
  36. custom_character_voice/amitaro/amitaro_24.wav +4 -0
  37. custom_character_voice/amitaro/amitaro_25.wav +0 -0
  38. custom_character_voice/amitaro/amitaro_26.wav +0 -0
  39. custom_character_voice/amitaro/amitaro_27.wav +0 -0
  40. custom_character_voice/amitaro/amitaro_28.wav +0 -0
  41. custom_character_voice/amitaro/amitaro_29.wav +0 -0
  42. custom_character_voice/amitaro/amitaro_3.wav +0 -0
  43. custom_character_voice/amitaro/amitaro_30.wav +0 -0
  44. custom_character_voice/amitaro/amitaro_31.wav +0 -0
  45. custom_character_voice/amitaro/amitaro_32.wav +0 -0
  46. custom_character_voice/amitaro/amitaro_33.wav +0 -0
  47. custom_character_voice/amitaro/amitaro_34.wav +0 -0
  48. custom_character_voice/amitaro/amitaro_35.wav +0 -0
  49. custom_character_voice/amitaro/amitaro_36.wav +0 -0
  50. custom_character_voice/amitaro/amitaro_37.wav +0 -0
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ *.pth filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .ipynb_checkpoints/
2
+ __pycache__/
3
+ pretrained_models/
ConvertBitrate.ipynb ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "8775d691",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import librosa\n",
11
+ "import os\n",
12
+ "import soundfile\n",
13
+ "from tqdm import tqdm, tqdm_notebook"
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "code",
18
+ "execution_count": 5,
19
+ "id": "bcd1f6dc",
20
+ "metadata": {},
21
+ "outputs": [
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ " 26%|█████████████████████████████████████████████▊ | 2606/10000 [01:01<02:54, 42.35it/s]"
27
+ ]
28
+ },
29
+ {
30
+ "name": "stdout",
31
+ "output_type": "stream",
32
+ "text": [
33
+ "Total audio file written : 2606\n"
34
+ ]
35
+ },
36
+ {
37
+ "name": "stderr",
38
+ "output_type": "stream",
39
+ "text": [
40
+ "\n"
41
+ ]
42
+ }
43
+ ],
44
+ "source": [
45
+ "base_dir = \"../data/amitaro\"\n",
46
+ "output_dir = \"../data/amitaro_22050hz\"\n",
47
+ "all_dir = [f for f in os.listdir(base_dir) if not os.path.isfile(os.path.join(base_dir, f))]\n",
48
+ "\n",
49
+ "file_list = []\n",
50
+ "\n",
51
+ "skip_dir = [\"301_dousa\",\n",
52
+ " \"801_eng_suuji\",\n",
53
+ " \"801_eng_jikan\",\n",
54
+ " \"803_eng_others\",\n",
55
+ " \"912_alphabet\",\n",
56
+ " \"912_alphabet2\",\n",
57
+ " \"913_web\",\n",
58
+ " \"sample\"]\n",
59
+ "\n",
60
+ "total_file_write = 0\n",
61
+ "\n",
62
+ "def recursive_til_audio_file_found(path):\n",
63
+ " listed_dir = [f for f in os.listdir(path)]\n",
64
+ " if len(listed_dir) == 0:\n",
65
+ " return\n",
66
+ " test_path_first = os.path.join(path, listed_dir[0])\n",
67
+ " \n",
68
+ " # continue through the directory if not a file\n",
69
+ " if not os.path.isfile(test_path_first):\n",
70
+ " for next_dir in listed_dir:\n",
71
+ " next_path = os.path.join(path, next_dir)\n",
72
+ " # skip any directory specify in skip_dir\n",
73
+ " for skip in skip_dir:\n",
74
+ " if next_path.find(skip) != -1:\n",
75
+ " break\n",
76
+ " else:\n",
77
+ " recursive_til_audio_file_found(next_path)\n",
78
+ " return\n",
79
+ "\n",
80
+ " #for new_dir in tqdm_notebook(listed_dir, desc=f\"Processing : {path}\"):\n",
81
+ " for new_dir in listed_dir:\n",
82
+ " new_path = os.path.join(path, new_dir)\n",
83
+ " \n",
84
+ " #if it is file, convert the audio to 16k and write to output directory\n",
85
+ "# output_path_base = path.replace(base_dir, output_dir)\n",
86
+ "# if not os.path.exists(output_path_base):\n",
87
+ "# os.makedirs(output_path_base, exist_ok=True)\n",
88
+ "\n",
89
+ " # not an audio file\n",
90
+ " if new_path.find(\".wav\") == -1 and new_path.find(\".mp3\") == -1:\n",
91
+ " continue\n",
92
+ "\n",
93
+ " global total_file_write\n",
94
+ "# audio, rate = librosa.load(new_path, sr=16000)\n",
95
+ " audio, rate = librosa.load(new_path, sr=22050)\n",
96
+ "# output_path = os.path.join(output_path_base, new_dir)\n",
97
+ " output_path = os.path.join(output_dir, new_dir)\n",
98
+ "# output_path = os.path.join(output_dir, \"sakuramiko_\" + str(total_file_write) + \".wav\")\n",
99
+ "# output_path = os.path.join(output_dir, new_dir[0:-4] + \".wav\")\n",
100
+ " soundfile.write(output_path, audio, rate, format='wav', subtype=\"PCM_16\")\n",
101
+ " file_list.append(new_dir)\n",
102
+ " \n",
103
+ " total_file_write += 1\n",
104
+ " pbar.update(1)\n",
105
+ " #print(f\"\\rWrite file{output_path}\", end=\"\")\n",
106
+ " \n",
107
+ "with tqdm(total=10000) as pbar:\n",
108
+ " recursive_til_audio_file_found(base_dir)\n",
109
+ "print(f\"Total audio file written : {total_file_write}\")"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 7,
115
+ "id": "7efe2fec",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "import os\n",
120
+ "base_dir = \"../data/amitaro_22050hz\"\n",
121
+ "output_dir = \"./custom_character_voice/amitaro\"\n",
122
+ "listed_dir = [f for f in os.listdir(base_dir)]\n",
123
+ "for i, val in enumerate(listed_dir):\n",
124
+ " os.rename(os.path.join(base_dir, val), os.path.join(output_dir, f\"amitaro_{i}.wav\"))"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": null,
130
+ "id": "34c1fd46",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "import json\n",
135
+ "out_json = {}\n",
136
+ "for val in file_list:\n",
137
+ " out_json[val] = {\"path\":val, \"kana\":\"\"}\n",
138
+ " \n",
139
+ "with open(\"./amitaro.json\", \"w\") as outfile:\n",
140
+ " outfile.write(json.dumps(out_json))"
141
+ ]
142
+ }
143
+ ],
144
+ "metadata": {
145
+ "kernelspec": {
146
+ "display_name": "Python 3 (ipykernel)",
147
+ "language": "python",
148
+ "name": "python3"
149
+ },
150
+ "language_info": {
151
+ "codemirror_mode": {
152
+ "name": "ipython",
153
+ "version": 3
154
+ },
155
+ "file_extension": ".py",
156
+ "mimetype": "text/x-python",
157
+ "name": "python",
158
+ "nbconvert_exporter": "python",
159
+ "pygments_lexer": "ipython3",
160
+ "version": "3.10.9"
161
+ }
162
+ },
163
+ "nbformat": 4,
164
+ "nbformat_minor": 5
165
+ }
ConvertBitrate.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import librosa
2
+ import os
3
+ import soundfile
4
+ from tqdm import tqdm, tqdm_notebook
5
+
6
# Root directory that is walked recursively for source audio files.
base_dir = "./data_sakuramiko_senbetsu"
# Flat directory that receives the converted WAV files.
output_dir = "./plachta/VITS-fast-fine-tuning/custom_character_voice/sakuramiko"
# Immediate sub-directories of base_dir.
# NOTE(review): not referenced by the code visible in this script - confirm
# whether it is still needed.
all_dir = [f for f in os.listdir(base_dir) if not os.path.isfile(os.path.join(base_dir, f))]

# Original file names of every converted clip, in processing order.
file_list = []

# A directory is not descended into when its path contains any of these
# substrings.
skip_dir = [
    "301_dousa",
    "801_eng_suuji",
    "801_eng_jikan",
    "803_eng_others",
    "912_alphabet",
    "912_alphabet2",
    "913_web",
    "sample",
]

# Running count of written files; also used to number the output files.
total_file_write = 0
22
+
23
def recursive_til_audio_file_found(path):
    """Walk ``path`` recursively and convert every audio file found to WAV.

    Sub-directories whose path contains any entry of the module-level
    ``skip_dir`` list are not descended into. Every ``.wav`` / ``.mp3`` file
    is loaded with librosa (resampled to 22050 Hz), written to ``output_dir``
    as 16-bit PCM WAV named ``sakuramiko_<n>.wav``, and its original file name
    is appended to ``file_list``.

    Relies on module-level state: ``skip_dir``, ``output_dir``, ``file_list``,
    ``total_file_write`` and the active tqdm progress bar ``pbar``.

    Args:
        path: Directory to scan.
    """
    global total_file_write

    for entry in os.listdir(path):
        entry_path = os.path.join(path, entry)

        # Classify every entry on its own. Deciding from the first entry only
        # breaks on directories that mix files and sub-directories.
        if os.path.isdir(entry_path):
            # Skip any directory specified in skip_dir (substring match on the path).
            if not any(skip in entry_path for skip in skip_dir):
                recursive_til_audio_file_found(entry_path)
            continue

        # Test the real file extension; a substring search over the whole path
        # would also match names such as "x.wav.bak" or files inside a folder
        # whose name contains ".wav".
        if not entry.endswith((".wav", ".mp3")):
            continue

        # Resample to 22050 Hz and store as 16-bit PCM WAV.
        audio, rate = librosa.load(entry_path, sr=22050)
        output_path = os.path.join(output_dir, "sakuramiko_" + str(total_file_write) + ".wav")
        soundfile.write(output_path, audio, rate, format='wav', subtype="PCM_16")
        file_list.append(entry)

        total_file_write += 1
        pbar.update(1)
66
+
67
# Convert everything below base_dir. The total is a hard-coded bar length and
# does not limit how many files are processed.
with tqdm(total=24778) as pbar:
    recursive_til_audio_file_found(base_dir)
print(f"Total audio file written : {total_file_write}")

import json

# Manifest keyed by original file name. "kana" is written empty here.
out_json = {name: {"path": name, "kana": ""} for name in file_list}

with open("./amitaro.json", "w") as outfile:
    json.dump(out_json, outfile)
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
OUTPUT_MODEL/D_Amitaro.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4610d88f7ce89c54e7bbee6fe1a60b9c98ade40dc8ec052624d0fcac67d6676c
3
+ size 187027092
OUTPUT_MODEL/G_Amitaro.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7644eed4b2c8afd8102ba6ec231a81d620d3a9bd5b659c1481552a3b2d4fdbc9
3
+ size 158888169
OUTPUT_MODEL/config.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 1,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"./final_annotation_train.txt",
21
+ "validation_files":"./final_annotation_val.txt",
22
+ "text_cleaners":["japanese_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 1,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": {"amitaro":0
54
+ },
55
+ "symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
56
+ }
ParseAmitaroHTML.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Requires beautifulsoup4. Install it from a shell, not from this script, e.g.:
#   pip --cert /etc/pki/ca-trust/source/anchors/tri-ace-CA-2015.cer install --trusted-host pypi.org --trusted-host files.pythonhosted.org beautifulsoup4
# (The command above used to be a bare statement here, which is a syntax error
# in a .py file.)

import json
import random

from bs4 import BeautifulSoup

# Parse the saved HTML page; an explicit parser keeps the result independent
# of which optional parsers happen to be installed.
with open("./amitaro.htm", "r") as f:
    soup = BeautifulSoup(f.read(), "html.parser")
print(soup.prettify())

# Manifest produced by ConvertBitrate: {file name: {"path": ..., "kana": ""}}.
with open('amitaro.json') as f:
    file_list = json.load(f)

# For every table cell whose first child is a known file name, take the text
# of the first <a> found in the first child of the preceding cell as the kana.
td = soup.find_all('td')
for i, cell in enumerate(td):
    if len(cell.contents) == 0:
        continue
    key = cell.contents[0]
    if key not in file_list:
        continue
    # i == 0 has no preceding cell; td[-1] would silently wrap to the last one.
    if i == 0 or len(td[i - 1].contents) == 0:
        continue
    temp = BeautifulSoup(str(td[i - 1].contents[0]), "html.parser")
    anchors = temp.find_all('a')
    # No link (or an empty one): leave "kana" empty so the entry is dropped
    # from the annotation list below instead of crashing the whole run.
    if not anchors or not anchors[0].contents:
        continue
    print(anchors[0].contents[0])
    file_list[key]["kana"] = str(anchors[0].contents[0])

# Point every entry at the resampled audio directory.
for entry in file_list.values():
    entry["path"] = "./data_amitaro22k/" + entry["path"]

# Written once, after the paths are final (an earlier write of the same file
# was immediately overwritten by this one).
with open("./amitaro_with_kana.json", "w") as outfile:
    outfile.write(json.dumps(file_list, indent=4, ensure_ascii=False))

# Build "path|10|kana" annotation lines, skipping entries without kana and
# entries whose kana contains "(".
file = []
for entry in file_list.values():
    kana = entry['kana']
    if len(kana) == 0 or "(" in kana:
        continue
    file.append(f"{entry['path']}|10|{kana}")

# NOTE(review): the training list is the full set, so every validation line
# below is also a training line - confirm this overlap is intended.
amitaro_train = list(file)

# Validation set: ceil(len(file) / 10) distinct lines chosen at random.
val_count = (len(file) + 9) // 10
amitaro_val = random.sample(file, val_count)

with open("amitaro_train.txt", "w") as f:
    f.writelines(f"{line}\n" for line in amitaro_train)

with open("amitaro_val.txt", "w") as f:
    f.writelines(f"{line}\n" for line in amitaro_val)
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: VITS-TTS-Japanese-Only-Amitaro
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 3.27.0
8
+ app_file: app.py
9
+ pinned: false
10
+ python_version: '3.10'
11
+ license: apache-2.0
12
+ duplicated_from: Lycoris53/VITS-TTS-Japanese-Only-Amitaro
13
+ ---
Test_Inference.ipynb ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "5dde1b9d",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from pathlib import Path\n",
11
+ "import utils\n",
12
+ "from models import SynthesizerTrn\n",
13
+ "import torch\n",
14
+ "from torch import no_grad, LongTensor\n",
15
+ "import librosa\n",
16
+ "from text import text_to_sequence, _clean_text\n",
17
+ "import commons\n",
18
+ "import scipy.io.wavfile as wavf\n",
19
+ "import os\n",
20
+ "\n",
21
+ "import IPython.display as ipd"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 11,
27
+ "id": "f4bc040a",
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "INFO:root:Loaded checkpoint './OUTPUT_MODEL/G_latest.pth' (iteration 601)\n",
35
+ "o↑hayoogozaima↓sU.\n",
36
+ " length:18\n",
37
+ " length:18\n"
38
+ ]
39
+ },
40
+ {
41
+ "data": {
42
+ "text/html": [
43
+ "\n",
44
+ " <audio controls=\"controls\" >\n",
45
+ " <source src=\"data:audio/wav;base64,\" type=\"audio/wav\" />\n",
46
+ " Your browser does not support the audio element.\n",
47
+ " </audio>\n",
48
+ " "
49
+ ],
50
+ "text/plain": [
51
+ "<IPython.lib.display.Audio object>"
52
+ ]
53
+ },
54
+ "metadata": {},
55
+ "output_type": "display_data"
56
+ }
57
+ ],
58
+ "source": [
59
+ "model_path = \"./OUTPUT_MODEL/G_latest.pth\"\n",
60
+ "config_path = \"./OUTPUT_MODEL/config.json\"\n",
61
+ "\n",
62
+ "length = 1.0\n",
63
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
64
+ "\n",
65
+ "def get_text(text, hps, is_symbol):\n",
66
+ " text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)\n",
67
+ " if hps.data.add_blank:\n",
68
+ " text_norm = commons.intersperse(text_norm, 0)\n",
69
+ " text_norm = LongTensor(text_norm)\n",
70
+ " return text_norm\n",
71
+ "\n",
72
+ "hps = utils.get_hparams_from_file(config_path)\n",
73
+ "net_g = SynthesizerTrn(\n",
74
+ " len(hps.symbols),\n",
75
+ " hps.data.filter_length // 2 + 1,\n",
76
+ " hps.train.segment_size // hps.data.hop_length,\n",
77
+ " n_speakers=hps.data.n_speakers,\n",
78
+ " **hps.model).to(device)\n",
79
+ "_ = net_g.eval()\n",
80
+ "_ = utils.load_checkpoint(model_path, net_g, None)\n",
81
+ "\n",
82
+ "speaker_ids = hps.speakers\n",
83
+ "\n",
84
+ "text = \"おはようございます。\"\n",
85
+ "#text = \"[JA]\" + text + \"[JA]\"\n",
86
+ "speaker_id = 0\n",
87
+ "stn_tst = get_text(text, hps, False)\n",
88
+ "with no_grad():\n",
89
+ " x_tst = stn_tst.unsqueeze(0).to(device)\n",
90
+ " x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)\n",
91
+ " sid = LongTensor([speaker_id]).to(device)\n",
92
+ " audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.6,\n",
93
+ " length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()\n",
94
+ "del stn_tst, x_tst, x_tst_lengths, sid\n",
95
+ "\n",
96
+ "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": null,
102
+ "id": "032cc92d",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": []
106
+ }
107
+ ],
108
+ "metadata": {
109
+ "kernelspec": {
110
+ "display_name": "Python 3 (ipykernel)",
111
+ "language": "python",
112
+ "name": "python3"
113
+ },
114
+ "language_info": {
115
+ "codemirror_mode": {
116
+ "name": "ipython",
117
+ "version": 3
118
+ },
119
+ "file_extension": ".py",
120
+ "mimetype": "text/x-python",
121
+ "name": "python",
122
+ "nbconvert_exporter": "python",
123
+ "pygments_lexer": "ipython3",
124
+ "version": "3.10.9"
125
+ }
126
+ },
127
+ "nbformat": 4,
128
+ "nbformat_minor": 5
129
+ }
VC_inference.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import torch
4
+ from torch import no_grad, LongTensor
5
+ import argparse
6
+ import commons
7
+ from mel_processing import spectrogram_torch
8
+ import utils
9
+ from models import SynthesizerTrn
10
+ import gradio as gr
11
+ import librosa
12
+ import webbrowser
13
+
14
+ from text import text_to_sequence, _clean_text
15
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
16
+ import logging
17
+ logging.getLogger("PIL").setLevel(logging.WARNING)
18
+ logging.getLogger("urllib3").setLevel(logging.WARNING)
19
+ logging.getLogger("markdown_it").setLevel(logging.WARNING)
20
+ logging.getLogger("httpx").setLevel(logging.WARNING)
21
+ logging.getLogger("asyncio").setLevel(logging.WARNING)
22
+
23
+ language_marks = {
24
+ "Japanese": "",
25
+ "日本語": "[JA]",
26
+ "简体中文": "[ZH]",
27
+ "English": "[EN]",
28
+ "Mix": "",
29
+ }
30
+ lang = ['日本語', '简体中文', 'English', 'Mix']
31
def get_text(text, hps, is_symbol):
    """Turn a text string into a LongTensor of symbol ids.

    Args:
        text: Input string handed to ``text_to_sequence``.
        hps: Hyper-parameter object; ``hps.symbols``,
            ``hps.data.text_cleaners`` and ``hps.data.add_blank`` are read.
        is_symbol: When truthy, an empty cleaner list is used instead of
            ``hps.data.text_cleaners``.

    Returns:
        ``LongTensor`` built from the id sequence; when
        ``hps.data.add_blank`` is set, id 0 is interleaved around every id.
    """
    cleaners = [] if is_symbol else hps.data.text_cleaners
    sequence = text_to_sequence(text, hps.symbols, cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return LongTensor(sequence)
37
+
38
def create_tts_fn(model, hps, speaker_ids):
    """Build the text-to-speech callback bound to one model.

    Args:
        model: Object exposing ``infer(x, x_lengths, sid=..., ...)``.
        hps: Hyper-parameter object (``hps.data.sampling_rate`` is returned
            alongside the audio).
        speaker_ids: Mapping from speaker name to integer speaker id.

    Returns:
        ``tts_fn(text, speaker, language, speed)`` which returns
        ``("Success", (sampling_rate, audio))``.
    """

    def tts_fn(text, speaker, language, speed):
        if language is not None:
            # Wrap the text in the language tag on both sides.
            mark = language_marks[language]
            text = mark + text + mark
        speaker_id = speaker_ids[speaker]
        tokens = get_text(text, hps, False)
        with no_grad():
            batch = tokens.unsqueeze(0).to(device)
            batch_lengths = LongTensor([tokens.size(0)]).to(device)
            sid = LongTensor([speaker_id]).to(device)
            result = model.infer(batch, batch_lengths, sid=sid, noise_scale=.667,
                                 noise_scale_w=0.8, length_scale=1.0 / speed)
            audio = result[0][0, 0].data.cpu().float().numpy()
        return "Success", (hps.data.sampling_rate, audio)

    return tts_fn
54
+
55
def create_vc_fn(model, hps, speaker_ids):
    """Build the voice-conversion callback bound to one model.

    Args:
        model: Object exposing ``voice_conversion(spec, spec_lengths,
            sid_src=..., sid_tgt=...)``.
        hps: Hyper-parameter object; the ``hps.data`` audio settings are read.
        speaker_ids: Mapping from speaker name to integer speaker id.

    Returns:
        ``vc_fn(original_speaker, target_speaker, record_audio, upload_audio)``
        returning ``(message, (sampling_rate, audio))`` or ``(message, None)``
        when no audio was supplied.
    """

    def vc_fn(original_speaker, target_speaker, record_audio, upload_audio):
        # The recorded clip wins over the uploaded one.
        input_audio = record_audio if record_audio is not None else upload_audio
        if input_audio is None:
            return "You need to record or upload an audio", None
        sampling_rate, audio = input_audio
        original_speaker_id = speaker_ids[original_speaker]
        target_speaker_id = speaker_ids[target_speaker]

        # np.iinfo only accepts integer dtypes; float input is used as-is
        # instead of crashing with ValueError.
        if np.issubdtype(audio.dtype, np.integer):
            audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
        else:
            audio = audio.astype(np.float32)
        if len(audio.shape) > 1:
            audio = librosa.to_mono(audio.transpose(1, 0))
        if sampling_rate != hps.data.sampling_rate:
            audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=hps.data.sampling_rate)
        with no_grad():
            y = torch.FloatTensor(audio)
            peak = max(-y.min(), y.max())
            # Skip peak normalisation for all-zero input: 0 / 0 would fill
            # the signal with NaNs.
            if peak > 0:
                # NOTE(review): dividing by 0.99 scales the peak slightly
                # above 1.0; kept as in the original -- confirm intent.
                y = y / peak / 0.99
            y = y.to(device)
            y = y.unsqueeze(0)
            spec = spectrogram_torch(y, hps.data.filter_length,
                                     hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
                                     center=False).to(device)
            spec_lengths = LongTensor([spec.size(-1)]).to(device)
            sid_src = LongTensor([original_speaker_id]).to(device)
            sid_tgt = LongTensor([target_speaker_id]).to(device)
            audio = model.voice_conversion(spec, spec_lengths, sid_src=sid_src, sid_tgt=sid_tgt)[0][
                0, 0].data.cpu().float().numpy()
        del y, spec, spec_lengths, sid_src, sid_tgt
        return "Success", (hps.data.sampling_rate, audio)

    return vc_fn
86
if __name__ == "__main__":
    # Entry point: load the model, then serve a two-tab Gradio UI
    # (text-to-speech and voice conversion).

    def _parse_share(value):
        """Interpret the --share value; explicit "false"-like words disable it.

        Without a converter argparse passes the raw string through, and any
        non-empty string (including "False") is truthy.
        """
        if isinstance(value, bool):
            return value
        return str(value).strip().lower() not in ("", "0", "false", "f", "no", "n", "off")

    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", default="./G_latest.pth", help="directory to your fine-tuned model")
    parser.add_argument("--config_dir", default="./finetune_speaker.json", help="directory to your model config file")
    parser.add_argument("--share", default=False, type=_parse_share, help="make link public (used in colab)")

    args = parser.parse_args()
    hps = utils.get_hparams_from_file(args.config_dir)

    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()

    _ = utils.load_checkpoint(args.model_dir, net_g, None)
    speaker_ids = hps.speakers
    speakers = list(hps.speakers.keys())
    tts_fn = create_tts_fn(net_g, hps, speaker_ids)
    vc_fn = create_vc_fn(net_g, hps, speaker_ids)
    app = gr.Blocks()
    # NOTE(review): widget nesting below was reconstructed from a view that
    # lost indentation -- confirm against the rendered layout.
    with app:
        with gr.Tab("Text-to-Speech"):
            with gr.Row():
                with gr.Column():
                    textbox = gr.TextArea(label="Text",
                                          placeholder="Type your sentence here",
                                          value="こんにちわ。", elem_id="tts-input")
                    # select character
                    char_dropdown = gr.Dropdown(choices=speakers, value=speakers[0], label='character')
                    language_dropdown = gr.Dropdown(choices=lang, value=lang[0], label='language')
                    duration_slider = gr.Slider(minimum=0.1, maximum=5, value=1, step=0.1,
                                                label='速度 Speed')
                with gr.Column():
                    text_output = gr.Textbox(label="Message")
                    audio_output = gr.Audio(label="Output Audio", elem_id="tts-audio")
                    btn = gr.Button("Generate!")
                    btn.click(tts_fn,
                              inputs=[textbox, char_dropdown, language_dropdown, duration_slider,],
                              outputs=[text_output, audio_output])
        with gr.Tab("Voice Conversion"):
            gr.Markdown("""
                            录制或上传声音,并选择要转换的音色。
            """)
            with gr.Column():
                record_audio = gr.Audio(label="record your voice", source="microphone")
                upload_audio = gr.Audio(label="or upload audio here", source="upload")
                source_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="source speaker")
                target_speaker = gr.Dropdown(choices=speakers, value=speakers[0], label="target speaker")
            with gr.Column():
                message_box = gr.Textbox(label="Message")
                converted_audio = gr.Audio(label='converted audio')
            btn = gr.Button("Convert!")
            btn.click(vc_fn, inputs=[source_speaker, target_speaker, record_audio, upload_audio],
                      outputs=[message_box, converted_audio])
    webbrowser.open("http://127.0.0.1:7860")
    app.launch(share=args.share)
146
+
amitaro_jp_base.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 8,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"./final_annotation_train.txt",
21
+ "validation_files":"./final_annotation_val.txt",
22
+ "text_cleaners":["japanese_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 1,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": {"amitaro":0
54
+ },
55
+ "symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
56
+ }
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import utils
2
+ from models import SynthesizerTrn
3
+ import torch
4
+ from torch import no_grad, LongTensor
5
+ from text import text_to_sequence
6
+ import gradio as gr
7
+ import commons
8
+
9
+ model_path = "./OUTPUT_MODEL/G_Amitaro.pth"
10
+ config_path = "./OUTPUT_MODEL/config.json"
11
+
12
+ length = 1.0
13
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
14
+
15
def get_text(text, hps, is_symbol):
    """Encode ``text`` as a LongTensor of symbol ids.

    ``hps.data.text_cleaners`` are applied unless ``is_symbol`` is truthy.
    When ``hps.data.add_blank`` is set, id 0 is interleaved around every id
    via ``commons.intersperse``.
    """
    if is_symbol:
        cleaner_names = []
    else:
        cleaner_names = hps.data.text_cleaners
    ids = text_to_sequence(text, hps.symbols, cleaner_names)
    if hps.data.add_blank:
        ids = commons.intersperse(ids, 0)
    return LongTensor(ids)
21
+
22
# Populated on first use so the checkpoint is read once per process
# instead of once per request.
_VITS_CACHE = {}


def _load_vits():
    """Return ``(hps, net_g)``, building and loading the model on first call."""
    if "model" not in _VITS_CACHE:
        hps = utils.get_hparams_from_file(config_path)
        net_g = SynthesizerTrn(
            len(hps.symbols),
            hps.data.filter_length // 2 + 1,
            hps.train.segment_size // hps.data.hop_length,
            n_speakers=hps.data.n_speakers,
            **hps.model).to(device)
        net_g.eval()
        utils.load_checkpoint(model_path, net_g, None)
        # "model" is stored last so its presence implies "hps" is set too.
        _VITS_CACHE["hps"] = hps
        _VITS_CACHE["model"] = net_g
    return _VITS_CACHE["hps"], _VITS_CACHE["model"]


def get_vits_array(text):
    """Synthesize ``text`` with speaker id 0.

    Args:
        text: Sentence to speak.

    Returns:
        ``(sampling_rate, audio)`` where ``audio`` is the numpy array taken
        from the first element of ``net_g.infer(...)``.
    """
    hps, net_g = _load_vits()

    #text = "[JA]" + text + "[JA]"
    speaker_id = 0
    stn_tst = get_text(text, hps, False)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.6,
                            length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
    del stn_tst, x_tst, x_tst_lengths, sid

    return (hps.data.sampling_rate, audio)
47
+
48
# Single-page Gradio demo: one text box in, one audio player out.
# NOTE(review): widget nesting was reconstructed from a view that lost
# indentation -- confirm against the rendered layout.
with gr.Blocks() as app:
    gr.Markdown(
        "# VITS-TTS-Japanese-Only-Amitaro\n\n"
        "Sample usage of Finetune model [Lycoris53/Vits-Japanese-Only-Amitaro](https://huggingface.co/Lycoris53/Vits-Japanese-Only-Amitaro) \n"
        "Base finetuning code is from [Plachtaa/VITS-fast-fine-tuning](https://github.com/Plachtaa/VITS-fast-fine-tuning)"
    )
    with gr.Row():
        with gr.Column():
            textbox = gr.TextArea(
                label="Text",
                placeholder="Type your sentence here (Maximum 150 words)",
                value="おはようございます。",
            )
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio")
            btn = gr.Button("Generate Voice!")
            btn.click(get_vits_array, inputs=[textbox], outputs=[audio_output])

# Queue requests before serving.
app.queue(concurrency_count=3).launch()
attentions.py ADDED
@@ -0,0 +1,303 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import math
3
+ import numpy as np
4
+ import torch
5
+ from torch import nn
6
+ from torch.nn import functional as F
7
+
8
+ import commons
9
+ import modules
10
+ from modules import LayerNorm
11
+
12
+
13
class Encoder(nn.Module):
    """Stack of ``n_layers`` self-attention + feed-forward layers.

    Each layer applies attention, dropout, a residual sum and layer norm,
    then the same sequence with the feed-forward network.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., window_size=4, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.window_size = window_size

        self.drop = nn.Dropout(p_dropout)
        self.attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                      p_dropout=p_dropout, window_size=window_size)
            self.attn_layers.append(attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size,
                      p_dropout=p_dropout)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask):
        """Run all layers over ``x``; ``x_mask`` is multiplied in before and after."""
        # Pairwise mask: a position pair is kept only if both ends are valid.
        attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for layer in range(self.n_layers):
            attended = self.drop(self.attn_layers[layer](x, x, attn_mask))
            x = self.norm_layers_1[layer](x + attended)

            fed = self.drop(self.ffn_layers[layer](x, x_mask))
            x = self.norm_layers_2[layer](x + fed)
        return x * x_mask
48
+
49
+
50
class Decoder(nn.Module):
    """Stack of ``n_layers`` decoder layers.

    Each layer runs masked self-attention, encoder-decoder attention and a
    causal feed-forward network, each followed by dropout, a residual sum
    and layer norm.
    """

    def __init__(self, hidden_channels, filter_channels, n_heads, n_layers, kernel_size=1, p_dropout=0., proximal_bias=False, proximal_init=True, **kwargs):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.filter_channels = filter_channels
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init

        self.drop = nn.Dropout(p_dropout)
        self.self_attn_layers = nn.ModuleList()
        self.norm_layers_0 = nn.ModuleList()
        self.encdec_attn_layers = nn.ModuleList()
        self.norm_layers_1 = nn.ModuleList()
        self.ffn_layers = nn.ModuleList()
        self.norm_layers_2 = nn.ModuleList()
        for _ in range(self.n_layers):
            self_attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                           p_dropout=p_dropout, proximal_bias=proximal_bias,
                                           proximal_init=proximal_init)
            self.self_attn_layers.append(self_attn)
            self.norm_layers_0.append(LayerNorm(hidden_channels))
            cross_attn = MultiHeadAttention(hidden_channels, hidden_channels, n_heads,
                                            p_dropout=p_dropout)
            self.encdec_attn_layers.append(cross_attn)
            self.norm_layers_1.append(LayerNorm(hidden_channels))
            ffn = FFN(hidden_channels, hidden_channels, filter_channels, kernel_size,
                      p_dropout=p_dropout, causal=True)
            self.ffn_layers.append(ffn)
            self.norm_layers_2.append(LayerNorm(hidden_channels))

    def forward(self, x, x_mask, h, h_mask):
        """
        x: decoder input
        h: encoder output
        """
        # Lower-triangular mask from commons.subsequent_mask for self-attention.
        self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(device=x.device, dtype=x.dtype)
        encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
        x = x * x_mask
        for layer in range(self.n_layers):
            out = self.drop(self.self_attn_layers[layer](x, x, self_attn_mask))
            x = self.norm_layers_0[layer](x + out)

            out = self.drop(self.encdec_attn_layers[layer](x, h, encdec_attn_mask))
            x = self.norm_layers_1[layer](x + out)

            out = self.drop(self.ffn_layers[layer](x, x_mask))
            x = self.norm_layers_2[layer](x + out)
        return x * x_mask
99
+
100
+
101
class MultiHeadAttention(nn.Module):
    """Multi-head attention over ``[b, d, t]`` tensors using 1x1 convolutions.

    Optional extras, each enabled by a constructor argument: learned
    relative-position embeddings (``window_size``), a proximal bias
    (``proximal_bias``) and a band mask (``block_length``). The most recent
    attention weights are kept in ``self.attn``.
    """

    def __init__(self, channels, out_channels, n_heads, p_dropout=0., window_size=None, heads_share=True, block_length=None, proximal_bias=False, proximal_init=False):
        super().__init__()
        assert channels % n_heads == 0

        self.channels = channels
        self.out_channels = out_channels
        self.n_heads = n_heads
        self.p_dropout = p_dropout
        self.window_size = window_size
        self.heads_share = heads_share
        self.block_length = block_length
        self.proximal_bias = proximal_bias
        self.proximal_init = proximal_init
        self.attn = None

        self.k_channels = channels // n_heads
        self.conv_q = nn.Conv1d(channels, channels, 1)
        self.conv_k = nn.Conv1d(channels, channels, 1)
        self.conv_v = nn.Conv1d(channels, channels, 1)
        self.conv_o = nn.Conv1d(channels, out_channels, 1)
        self.drop = nn.Dropout(p_dropout)

        if window_size is not None:
            # One shared embedding table, or one per head.
            n_heads_rel = 1 if heads_share else n_heads
            rel_stddev = self.k_channels**-0.5
            rel_shape = (n_heads_rel, window_size * 2 + 1, self.k_channels)
            self.emb_rel_k = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)
            self.emb_rel_v = nn.Parameter(torch.randn(*rel_shape) * rel_stddev)

        nn.init.xavier_uniform_(self.conv_q.weight)
        nn.init.xavier_uniform_(self.conv_k.weight)
        nn.init.xavier_uniform_(self.conv_v.weight)
        if proximal_init:
            # Start the key projection as a copy of the query projection.
            with torch.no_grad():
                self.conv_k.weight.copy_(self.conv_q.weight)
                self.conv_k.bias.copy_(self.conv_q.bias)

    def forward(self, x, c, attn_mask=None):
        """Attend from ``x`` (queries) to ``c`` (keys/values) and project the result."""
        q = self.conv_q(x)
        k = self.conv_k(c)
        v = self.conv_v(c)

        attended, self.attn = self.attention(q, k, v, mask=attn_mask)

        return self.conv_o(attended)

    def attention(self, query, key, value, mask=None):
        """Scaled dot-product attention; returns ``(output, attention weights)``."""
        # reshape [b, d, t] -> [b, n_h, t, d_k]
        b, d, t_s = key.size()
        t_t = query.size(2)
        query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
        key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
        value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)

        scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
        if self.window_size is not None:
            assert t_s == t_t, "Relative attention is only available for self-attention."
            key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
            rel_logits = self._matmul_with_relative_keys(query / math.sqrt(self.k_channels), key_relative_embeddings)
            scores = scores + self._relative_position_to_absolute_position(rel_logits)
        if self.proximal_bias:
            assert t_s == t_t, "Proximal bias is only available for self-attention."
            scores = scores + self._attention_bias_proximal(t_s).to(device=scores.device, dtype=scores.dtype)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e4)
            if self.block_length is not None:
                assert t_s == t_t, "Local attention is only available for self-attention."
                # Keep only a band of width block_length around the diagonal.
                block_mask = torch.ones_like(scores).triu(-self.block_length).tril(self.block_length)
                scores = scores.masked_fill(block_mask == 0, -1e4)
        p_attn = F.softmax(scores, dim=-1)  # [b, n_h, t_t, t_s]
        p_attn = self.drop(p_attn)
        output = torch.matmul(p_attn, value)
        if self.window_size is not None:
            relative_weights = self._absolute_position_to_relative_position(p_attn)
            value_relative_embeddings = self._get_relative_embeddings(self.emb_rel_v, t_s)
            output = output + self._matmul_with_relative_values(relative_weights, value_relative_embeddings)
        output = output.transpose(2, 3).contiguous().view(b, d, t_t)  # [b, n_h, t_t, d_k] -> [b, d, t_t]
        return output, p_attn

    def _matmul_with_relative_values(self, x, y):
        """
        x: [b, h, l, m]
        y: [h or 1, m, d]
        ret: [b, h, l, d]
        """
        return torch.matmul(x, y.unsqueeze(0))

    def _matmul_with_relative_keys(self, x, y):
        """
        x: [b, h, l, d]
        y: [h or 1, m, d]
        ret: [b, h, l, m]
        """
        return torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))

    def _get_relative_embeddings(self, relative_embeddings, length):
        """Pad and slice the embedding table to ``2 * length - 1`` positions."""
        # Pad first before slice to avoid using cond ops.
        pad_length = max(length - (self.window_size + 1), 0)
        slice_start_position = max((self.window_size + 1) - length, 0)
        slice_end_position = slice_start_position + 2 * length - 1
        if pad_length > 0:
            padded = F.pad(
                relative_embeddings,
                commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]))
        else:
            padded = relative_embeddings
        return padded[:, slice_start_position:slice_end_position]

    def _relative_position_to_absolute_position(self, x):
        """
        x: [b, h, l, 2*l-1]
        ret: [b, h, l, l]
        """
        batch, heads, length, _ = x.size()
        # Concat columns of pad to shift from relative to absolute indexing.
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))

        # Concat extra elements so to add up to shape (len+1, 2*len-1).
        x_flat = x.view([batch, heads, length * 2 * length])
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]]))

        # Reshape and slice out the padded elements.
        reshaped = x_flat.view([batch, heads, length + 1, 2 * length - 1])
        return reshaped[:, :, :length, length - 1:]

    def _absolute_position_to_relative_position(self, x):
        """
        x: [b, h, l, l]
        ret: [b, h, l, 2*l-1]
        """
        batch, heads, length, _ = x.size()
        # pad along column
        x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]]))
        x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
        # add 0's in the beginning that will skew the elements after reshape
        x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
        reshaped = x_flat.view([batch, heads, length, 2 * length])
        return reshaped[:, :, :, 1:]

    def _attention_bias_proximal(self, length):
        """Bias for self-attention to encourage attention to close positions.
        Args:
          length: an integer scalar.
        Returns:
          a Tensor with shape [1, 1, length, length]
        """
        r = torch.arange(length, dtype=torch.float32)
        diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
        bias = -torch.log1p(torch.abs(diff))
        return torch.unsqueeze(torch.unsqueeze(bias, 0), 0)
255
+
256
+
257
class FFN(nn.Module):
    """Two-layer 1-D convolutional feed-forward block.

    Padding is chosen at construction: left-only when ``causal`` is true,
    otherwise split on both sides. The activation is a sigmoid-based GELU
    approximation when ``activation == "gelu"``, else ReLU.
    """

    def __init__(self, in_channels, out_channels, filter_channels, kernel_size, p_dropout=0., activation=None, causal=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.filter_channels = filter_channels
        self.kernel_size = kernel_size
        self.p_dropout = p_dropout
        self.activation = activation
        self.causal = causal

        # Bind the padding strategy once instead of branching per call.
        self.padding = self._causal_padding if causal else self._same_padding

        self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
        self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
        self.drop = nn.Dropout(p_dropout)

    def forward(self, x, x_mask):
        """Apply conv -> activation -> dropout -> conv, masking at each step."""
        hidden = self.conv_1(self.padding(x * x_mask))
        if self.activation == "gelu":
            hidden = hidden * torch.sigmoid(1.702 * hidden)
        else:
            hidden = torch.relu(hidden)
        hidden = self.drop(hidden)
        out = self.conv_2(self.padding(hidden * x_mask))
        return out * x_mask

    def _causal_padding(self, x):
        """Left-pad the last axis by ``kernel_size - 1`` (no-op for kernel size 1)."""
        if self.kernel_size == 1:
            return x
        left, right = self.kernel_size - 1, 0
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))

    def _same_padding(self, x):
        """Pad both sides of the last axis by ``kernel_size - 1`` in total."""
        if self.kernel_size == 1:
            return x
        left = (self.kernel_size - 1) // 2
        right = self.kernel_size // 2
        return F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [left, right]]))
cmd_inference.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """该模块用于生成VITS文件
2
+ 使用方法
3
+
4
+ python cmd_inference.py -m 模型路径 -c 配置文件路径 -o 输出文件路径 -l 输入的语言 -t 输入文本 -s 合成目标说话人名称
5
+
6
+ 可选参数
7
+ -ns 感情变化程度
8
+ -nsw 音素发音长度
9
+ -ls 整体语速
10
+ -on 输出文件的名称
11
+
12
+ """
13
+
14
+ from pathlib import Path
15
+ import utils
16
+ from models import SynthesizerTrn
17
+ import torch
18
+ from torch import no_grad, LongTensor
19
+ import librosa
20
+ from text import text_to_sequence, _clean_text
21
+ import commons
22
+ import scipy.io.wavfile as wavf
23
+ import os
24
+
25
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
26
+
27
+ language_marks = {
28
+ "Japanese": "",
29
+ "日本語": "[JA]",
30
+ "简体中文": "[ZH]",
31
+ "English": "[EN]",
32
+ "Mix": "",
33
+ }
34
+
35
+
36
def get_text(text, hps, is_symbol):
    """Map ``text`` to a LongTensor of symbol ids.

    The cleaner list is empty when ``is_symbol`` is truthy, otherwise
    ``hps.data.text_cleaners``. If ``hps.data.add_blank`` is set, id 0 is
    interleaved around every id via ``commons.intersperse``.
    """
    active_cleaners = [] if is_symbol else hps.data.text_cleaners
    symbol_ids = text_to_sequence(text, hps.symbols, active_cleaners)
    if hps.data.add_blank:
        symbol_ids = commons.intersperse(symbol_ids, 0)
    return LongTensor(symbol_ids)
42
+
43
+
44
+
45
if __name__ == "__main__":
    # Command-line entry point: synthesize one utterance and write it as a
    # .wav file under the output directory.
    import argparse

    parser = argparse.ArgumentParser(description='vits inference')
    # Required arguments. -t and -s have no usable default, so argparse
    # enforces them instead of failing later on a None value.
    parser.add_argument('-m', '--model_path', type=str, default="logs/44k/G_0.pth", help='模型路径')
    parser.add_argument('-c', '--config_path', type=str, default="configs/config.json", help='配置文件路径')
    parser.add_argument('-o', '--output_path', type=str, default="output/vits", help='输出文件路径')
    parser.add_argument('-l', '--language', type=str, default="日本語", help='输入的语言')
    parser.add_argument('-t', '--text', type=str, required=True, help='输入文本')
    parser.add_argument('-s', '--spk', type=str, required=True, help='合成目标说话人名称')
    # Optional arguments
    parser.add_argument('-on', '--output_name', type=str, default="output", help='输出文件的名称')
    parser.add_argument('-ns', '--noise_scale', type=float,default= .667,help='感情变化程度')
    parser.add_argument('-nsw', '--noise_scale_w', type=float,default=0.6, help='音素发音长度')
    parser.add_argument('-ls', '--length_scale', type=float,default=1, help='整体语速')

    args = parser.parse_args()

    # Fail early with a usage message instead of a bare KeyError or
    # ZeroDivisionError further down.
    if args.language not in language_marks:
        parser.error("unknown language %r; choose one of: %s"
                     % (args.language, ", ".join(language_marks)))
    if args.length_scale <= 0:
        parser.error("--length_scale must be greater than 0")

    model_path = args.model_path
    config_path = args.config_path
    output_dir = Path(args.output_path)

    language = args.language
    text = args.text
    spk = args.spk
    noise_scale = args.noise_scale
    noise_scale_w = args.noise_scale_w
    length = args.length_scale
    output_name = args.output_name

    hps = utils.get_hparams_from_file(config_path)
    speaker_ids = hps.speakers
    if spk not in speaker_ids.keys():
        parser.error("unknown speaker %r; available: %s"
                     % (spk, ", ".join(str(name) for name in speaker_ids.keys())))

    output_dir.mkdir(parents=True, exist_ok=True)

    net_g = SynthesizerTrn(
        len(hps.symbols),
        hps.data.filter_length // 2 + 1,
        hps.train.segment_size // hps.data.hop_length,
        n_speakers=hps.data.n_speakers,
        **hps.model).to(device)
    _ = net_g.eval()
    _ = utils.load_checkpoint(model_path, net_g, None)

    # Wrap the text in the language tag on both sides.
    text = language_marks[language] + text + language_marks[language]
    speaker_id = speaker_ids[spk]
    stn_tst = get_text(text, hps, False)
    with no_grad():
        x_tst = stn_tst.unsqueeze(0).to(device)
        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
        sid = LongTensor([speaker_id]).to(device)
        audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=noise_scale, noise_scale_w=noise_scale_w,
                            length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
    del stn_tst, x_tst, x_tst_lengths, sid

    wavf.write(str(output_dir / (output_name + ".wav")), hps.data.sampling_rate, audio)
103
+
104
+
105
+
106
+
commons.py ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import numpy as np
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+
7
+
8
def init_weights(m, mean=0.0, std=0.01):
    """Re-draw ``m.weight`` from N(mean, std) when the class name contains "Conv".

    Modules whose class name does not contain "Conv" are left untouched.
    """
    is_conv = "Conv" in m.__class__.__name__
    if is_conv:
        m.weight.data.normal_(mean, std)
12
+
13
+
14
def get_padding(kernel_size, dilation=1):
    """Return ``int((kernel_size * dilation - dilation) / 2)``."""
    span = kernel_size * dilation - dilation
    return int(span / 2)
16
+
17
+
18
def convert_pad_shape(pad_shape):
    """Flatten per-dimension ``[before, after]`` pairs, last dimension first.

    The result is the flat list layout passed to ``F.pad`` elsewhere in
    this module.
    """
    return [amount for pair in reversed(pad_shape) for amount in pair]
22
+
23
+
24
def intersperse(lst, item):
    """Return a list with ``item`` before, between and after the elements of ``lst``."""
    out = [item]
    for element in lst:
        out.append(element)
        out.append(item)
    return out
28
+
29
+
30
def kl_divergence(m_p, logs_p, m_q, logs_q):
    """KL(P||Q), element-wise, from means ``m_*`` and log-scales ``logs_*``."""
    log_ratio = logs_q - logs_p
    scaled = 0.5 * (torch.exp(2. * logs_p) + ((m_p - m_q)**2)) * torch.exp(-2. * logs_q)
    return (log_ratio - 0.5) + scaled
35
+
36
+
37
def rand_gumbel(shape):
    """Sample from the Gumbel distribution, protect from overflows."""
    # Keep the uniform draw inside [0.00001, 0.99999) so neither log sees 0.
    uniform = torch.rand(shape) * 0.99998 + 0.00001
    neg_log = -torch.log(uniform)
    return -torch.log(neg_log)
41
+
42
+
43
def rand_gumbel_like(x):
    """Draw Gumbel noise with the size, dtype and device of ``x``."""
    noise = rand_gumbel(x.size())
    return noise.to(dtype=x.dtype, device=x.device)
46
+
47
+
48
def slice_segments(x, ids_str, segment_size=4):
    """Cut one fixed-width window per batch item along the last axis.

    Args:
        x: Tensor indexed as ``x[i, :, start:end]``.
        ids_str: Per-item start indices.
        segment_size: Window width.

    Returns:
        Tensor shaped like ``x[:, :, :segment_size]``. A row whose window
        runs past the end of ``x`` is left as zeros and a warning is issued.
    """
    import warnings

    ret = torch.zeros_like(x[:, :, :segment_size])
    width = ret.size(-1)
    for i in range(x.size(0)):
        idx_str = ids_str[i]
        idx_end = idx_str + segment_size
        segment = x[i, :, idx_str:idx_end]
        # Check the width explicitly rather than catching RuntimeError:
        # assigning a width-1 slice would broadcast silently instead of failing.
        if segment.size(-1) == width:
            ret[i] = segment
        else:
            warnings.warn(
                "slice_segments: window [%d, %d) of item %d does not fit; "
                "segment left as zeros" % (int(idx_str), int(idx_end), i))
    return ret
58
+
59
+
60
def rand_slice_segments(x, x_lengths=None, segment_size=4):
    """Slice one randomly placed window of ``segment_size`` per batch item.

    Returns ``(segments, start_indices)``. ``x_lengths`` defaults to the
    full length of the last axis of ``x``.
    """
    batch, _, total = x.size()
    lengths = total if x_lengths is None else x_lengths
    max_start = lengths - segment_size + 1
    uniform = torch.rand([batch]).to(device=x.device)
    ids_str = (uniform * max_start).to(dtype=torch.long)
    return slice_segments(x, ids_str, segment_size), ids_str
68
+
69
+
70
def get_timing_signal_1d(
        length, channels, min_timescale=1.0, max_timescale=1.0e4):
    """Build a sinusoidal timing signal of shape ``[1, channels, length]``.

    The first ``channels // 2`` rows are sines and the next ``channels // 2``
    rows are cosines over geometrically spaced timescales; an odd
    ``channels`` gets one zero row appended.

    Args:
        length: Number of positions.
        channels: Number of output channels.
        min_timescale: Smallest timescale.
        max_timescale: Largest timescale.
    """
    position = torch.arange(length, dtype=torch.float)
    num_timescales = channels // 2
    # With a single timescale (channels 2 or 3) the denominator would be 0;
    # clamp it to 1 so the only timescale is min_timescale.
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        max(num_timescales - 1, 1))
    inv_timescales = min_timescale * torch.exp(
        torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment)
    scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
    signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
    signal = F.pad(signal, [0, 0, 0, channels % 2])
    signal = signal.view(1, channels, length)
    return signal
84
+
85
+
86
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
    """Add the sinusoidal timing signal to ``x`` (a ``[b, channels, length]`` tensor)."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return x + timing
90
+
91
+
92
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
    """Concatenate the sinusoidal timing signal onto ``x`` along ``axis``."""
    _, channels, length = x.size()
    timing = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
    timing = timing.to(dtype=x.dtype, device=x.device)
    return torch.cat([x, timing], axis)
96
+
97
+
98
def subsequent_mask(length):
    """Return a ``[1, 1, length, length]`` lower-triangular mask of ones."""
    lower = torch.tril(torch.ones(length, length))
    return lower.unsqueeze(0).unsqueeze(0)
101
+
102
+
103
@torch.jit.script
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
    """Gated activation: ``tanh`` of the first channels times ``sigmoid`` of the rest.

    The inputs are summed; channels ``[:n_channels[0]]`` go through ``tanh``
    and the remaining channels through ``sigmoid``.
    """
    split = n_channels[0]
    summed = input_a + input_b
    tanh_part = torch.tanh(summed[:, :split, :])
    sigmoid_part = torch.sigmoid(summed[:, split:, :])
    return tanh_part * sigmoid_part
111
+
112
+
113
def convert_pad_shape(pad_shape):
    """Flatten ``[before, after]`` pairs into one list, last dimension first.

    NOTE(review): this module defines ``convert_pad_shape`` twice with the
    same body; this later definition is the one bound at import time.
    """
    flat = []
    for pair in pad_shape[::-1]:
        flat.extend(pair)
    return flat
117
+
118
+
119
def shift_1d(x):
    """Shift ``x`` one step along the last axis, zero-filling the first slot."""
    padded = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))
    return padded[:, :, :-1]
122
+
123
+
124
def sequence_mask(length, max_length=None):
    """Return a boolean mask, true where position index < ``length[i]``.

    ``max_length`` defaults to ``length.max()``.
    """
    limit = length.max() if max_length is None else max_length
    positions = torch.arange(limit, dtype=length.dtype, device=length.device)
    return positions.unsqueeze(0) < length.unsqueeze(1)
129
+
130
+
131
def generate_path(duration, mask):
    """
    duration: [b, 1, t_x]
    mask: [b, 1, t_y, t_x]

    Expands cumulative durations into an alignment path: each input step
    is marked over the output frames between the previous cumulative
    duration and its own. The result is multiplied by ``mask``.
    """
    b, _, t_y, t_x = mask.shape
    cum_duration = torch.cumsum(duration, -1)

    flat = cum_duration.view(b * t_x)
    path = sequence_mask(flat, t_y).to(mask.dtype).view(b, t_x, t_y)
    # Subtract the mask of the previous step so each row keeps only its own frames.
    previous = F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
    path = path - previous
    return path.unsqueeze(1).transpose(2, 3) * mask
147
+
148
+
149
def clip_grad_value_(parameters, clip_value, norm_type=2):
    """Clamp gradients element-wise in place and return their total norm.

    Args:
        parameters: One tensor or an iterable of tensors; those without a
            gradient are ignored.
        clip_value: Gradients are clamped to ``[-clip_value, clip_value]``;
            ``None`` disables clamping.
        norm_type: Order of the norm.

    Returns:
        The ``norm_type``-norm over all gradients, measured before clamping.
    """
    if isinstance(parameters, torch.Tensor):
        parameters = [parameters]
    with_grad = [p for p in parameters if p.grad is not None]
    norm_type = float(norm_type)
    limit = None if clip_value is None else float(clip_value)

    accumulated = 0
    for param in with_grad:
        accumulated += param.grad.data.norm(norm_type).item() ** norm_type
        if limit is not None:
            param.grad.data.clamp_(min=-limit, max=limit)
    return accumulated ** (1. / norm_type)
configs/amitaro_jp_base.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "train": {
3
+ "log_interval": 200,
4
+ "eval_interval": 1000,
5
+ "seed": 1234,
6
+ "epochs": 10000,
7
+ "learning_rate": 2e-4,
8
+ "betas": [0.8, 0.99],
9
+ "eps": 1e-9,
10
+ "batch_size": 1,
11
+ "fp16_run": true,
12
+ "lr_decay": 0.999875,
13
+ "segment_size": 8192,
14
+ "init_lr_ratio": 1,
15
+ "warmup_epochs": 0,
16
+ "c_mel": 45,
17
+ "c_kl": 1.0
18
+ },
19
+ "data": {
20
+ "training_files":"./final_annotation_train.txt",
21
+ "validation_files":"./final_annotation_val.txt",
22
+ "text_cleaners":["japanese_cleaners"],
23
+ "max_wav_value": 32768.0,
24
+ "sampling_rate": 22050,
25
+ "filter_length": 1024,
26
+ "hop_length": 256,
27
+ "win_length": 1024,
28
+ "n_mel_channels": 80,
29
+ "mel_fmin": 0.0,
30
+ "mel_fmax": null,
31
+ "add_blank": true,
32
+ "n_speakers": 1,
33
+ "cleaned_text": true
34
+ },
35
+ "model": {
36
+ "inter_channels": 192,
37
+ "hidden_channels": 192,
38
+ "filter_channels": 768,
39
+ "n_heads": 2,
40
+ "n_layers": 6,
41
+ "kernel_size": 3,
42
+ "p_dropout": 0.1,
43
+ "resblock": "1",
44
+ "resblock_kernel_sizes": [3,7,11],
45
+ "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
46
+ "upsample_rates": [8,8,2,2],
47
+ "upsample_initial_channel": 512,
48
+ "upsample_kernel_sizes": [16,16,4,4],
49
+ "n_layers_q": 3,
50
+ "use_spectral_norm": false,
51
+ "gin_channels": 256
52
+ },
53
+ "speakers": {"amitaro":0
54
+ },
55
+ "symbols": ["_", ",", ".", "!", "?", "-", "A", "E", "I", "N", "O", "Q", "U", "a", "b", "d", "e", "f", "g", "h", "i", "j", "k", "m", "n", "o", "p", "r", "s", "t", "u", "v", "w", "y", "z", "\u0283", "\u02a7", "\u2193", "\u2191", " "]
56
+ }
custom_character_voice/amitaro/amitaro_0.wav ADDED
Binary file (145 kB). View file
 
custom_character_voice/amitaro/amitaro_1.wav ADDED
Binary file (174 kB). View file
 
custom_character_voice/amitaro/amitaro_10.wav ADDED
Binary file (82.4 kB). View file
 
custom_character_voice/amitaro/amitaro_11.wav ADDED
Binary file (307 kB). View file
 
custom_character_voice/amitaro/amitaro_12.wav ADDED
Binary file (175 kB). View file
 
custom_character_voice/amitaro/amitaro_13.wav ADDED
Binary file (131 kB). View file
 
custom_character_voice/amitaro/amitaro_14.wav ADDED
Binary file (26.6 kB). View file
 
custom_character_voice/amitaro/amitaro_15.wav ADDED
Binary file (209 kB). View file
 
custom_character_voice/amitaro/amitaro_16.wav ADDED
Binary file (81.7 kB). View file
 
custom_character_voice/amitaro/amitaro_17.wav ADDED
Binary file (244 kB). View file
 
custom_character_voice/amitaro/amitaro_18.wav ADDED
Binary file (137 kB). View file
 
custom_character_voice/amitaro/amitaro_19.wav ADDED
Binary file (187 kB). View file
 
custom_character_voice/amitaro/amitaro_2.wav ADDED
Binary file (78.3 kB). View file
 
custom_character_voice/amitaro/amitaro_20.wav ADDED
Binary file (191 kB). View file
 
custom_character_voice/amitaro/amitaro_21.wav ADDED
Binary file (107 kB). View file
 
custom_character_voice/amitaro/amitaro_22.wav ADDED
Binary file (117 kB). View file
 
custom_character_voice/amitaro/amitaro_23.wav ADDED
Binary file (229 kB). View file
 
custom_character_voice/amitaro/amitaro_24.wav ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ [ViewState]
2
+ Mode=
3
+ Vid=
4
+ FolderType=Generic
custom_character_voice/amitaro/amitaro_25.wav ADDED
Binary file (101 kB). View file
 
custom_character_voice/amitaro/amitaro_26.wav ADDED
Binary file (83 kB). View file
 
custom_character_voice/amitaro/amitaro_27.wav ADDED
Binary file (91.8 kB). View file
 
custom_character_voice/amitaro/amitaro_28.wav ADDED
Binary file (96.6 kB). View file
 
custom_character_voice/amitaro/amitaro_29.wav ADDED
Binary file (88.6 kB). View file
 
custom_character_voice/amitaro/amitaro_3.wav ADDED
Binary file (105 kB). View file
 
custom_character_voice/amitaro/amitaro_30.wav ADDED
Binary file (97.8 kB). View file
 
custom_character_voice/amitaro/amitaro_31.wav ADDED
Binary file (103 kB). View file
 
custom_character_voice/amitaro/amitaro_32.wav ADDED
Binary file (81.7 kB). View file
 
custom_character_voice/amitaro/amitaro_33.wav ADDED
Binary file (69.7 kB). View file
 
custom_character_voice/amitaro/amitaro_34.wav ADDED
Binary file (83.1 kB). View file
 
custom_character_voice/amitaro/amitaro_35.wav ADDED
Binary file (67.9 kB). View file
 
custom_character_voice/amitaro/amitaro_36.wav ADDED
Binary file (47.1 kB). View file
 
custom_character_voice/amitaro/amitaro_37.wav ADDED
Binary file (72.6 kB). View file