Higobeatz committed
Commit
0a97d6c
1 Parent(s): 51ff5b9

Initial commit

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py +1 -0
  2. dreamvoice/.ipynb_checkpoints/api-checkpoint.py +295 -0
  3. dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml +26 -0
  4. dreamvoice/__init__.py +1 -0
  5. dreamvoice/__pycache__/__init__.cpython-310.pyc +0 -0
  6. dreamvoice/__pycache__/__init__.cpython-311.pyc +0 -0
  7. dreamvoice/__pycache__/api.cpython-310.pyc +0 -0
  8. dreamvoice/__pycache__/api.cpython-311.pyc +0 -0
  9. dreamvoice/api.py +295 -0
  10. dreamvoice/ckpts/bigvgan_24k/config.json +44 -0
  11. dreamvoice/ckpts/bigvgan_24k/g_01000000.pt +3 -0
  12. dreamvoice/ckpts/dreamvc_base.pt +3 -0
  13. dreamvoice/ckpts/dreamvc_cross.pt +3 -0
  14. dreamvoice/ckpts/dreamvc_plugin.pt +3 -0
  15. dreamvoice/ckpts/spk_encoder/pretrained.pt +3 -0
  16. dreamvoice/dreamvc.yaml +26 -0
  17. dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py +103 -0
  18. dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py +76 -0
  19. dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py +0 -0
  20. dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py +0 -0
  21. dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py +144 -0
  22. dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc +0 -0
  23. dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc +0 -0
  24. dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc +0 -0
  25. dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc +0 -0
  26. dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml +47 -0
  27. dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml +34 -0
  28. dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml +45 -0
  29. dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml +33 -0
  30. dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml +39 -0
  31. dreamvoice/src/configs/diffvc_base.yaml +47 -0
  32. dreamvoice/src/configs/diffvc_base_pitch.yaml +34 -0
  33. dreamvoice/src/configs/diffvc_cross.yaml +45 -0
  34. dreamvoice/src/configs/diffvc_cross_pitch.yaml +33 -0
  35. dreamvoice/src/configs/plugin_cross.yaml +39 -0
  36. dreamvoice/src/debug.py +0 -0
  37. dreamvoice/src/extract_features.py +103 -0
  38. dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py +42 -0
  39. dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py +40 -0
  40. dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py +24 -0
  41. dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py +22 -0
  42. dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc +0 -0
  43. dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc +0 -0
  44. dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc +0 -0
  45. dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc +0 -0
  46. dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc +0 -0
  47. dreamvoice/src/feats/contentvec.py +42 -0
  48. dreamvoice/src/feats/contentvec_hf.py +40 -0
  49. dreamvoice/src/feats/hubert/.gitignore +132 -0
  50. dreamvoice/src/feats/hubert/LICENSE +21 -0
dreamvoice/.ipynb_checkpoints/__init__-checkpoint.py ADDED
@@ -0,0 +1 @@
+ from .api import DreamVoice
dreamvoice/.ipynb_checkpoints/api-checkpoint.py ADDED
@@ -0,0 +1,295 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/api.py below.)
dreamvoice/.ipynb_checkpoints/dreamvc-checkpoint.yaml ADDED
@@ -0,0 +1,26 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/dreamvc.yaml below.)
dreamvoice/__init__.py ADDED
@@ -0,0 +1 @@
+ from .api import DreamVoice
dreamvoice/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (202 Bytes).
dreamvoice/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (225 Bytes).
dreamvoice/__pycache__/api.cpython-310.pyc ADDED
Binary file (8.05 kB).
dreamvoice/__pycache__/api.cpython-311.pyc ADDED
Binary file (14.3 kB).
dreamvoice/api.py ADDED
@@ -0,0 +1,295 @@
+ import os
+ import requests
+ import yaml
+ import torch
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ from pathlib import Path
+ from transformers import T5Tokenizer, T5EncoderModel
+ from tqdm import tqdm
+ from .src.vc_wrapper import ReDiffVC, DreamVC
+ from .src.plugin_wrapper import DreamVG
+ from .src.modules.speaker_encoder.encoder import inference as spk_encoder
+ from .src.modules.BigVGAN.inference import load_model as load_vocoder
+ from .src.feats.contentvec_hf import get_content_model, get_content
+
+
+ class DreamVoice:
+     def __init__(self, config='dreamvc.yaml', mode='plugin', device='cuda', chunk_size=16):
+         # Initial setup
+         script_dir = Path(__file__).resolve().parent
+         config_path = script_dir / config
+
+         # Load configuration file
+         with open(config_path, 'r') as fp:
+             self.config = yaml.safe_load(fp)
+
+         self.script_dir = script_dir
+
+         # Ensure all checkpoints are downloaded
+         self._ensure_checkpoints_exist()
+
+         # Initialize attributes
+         self.device = device
+         self.sr = self.config['sample_rate']
+
+         # Load vocoder
+         vocoder_path = script_dir / self.config['vocoder_path']
+         self.hifigan, _ = load_vocoder(vocoder_path, device)
+         self.hifigan.eval()
+
+         # Load content model
+         self.content_model = get_content_model().to(device)
+
+         # Load tokenizer and text encoder
+         lm_path = self.config['lm_path']
+         self.tokenizer = T5Tokenizer.from_pretrained(lm_path)
+         self.text_encoder = T5EncoderModel.from_pretrained(lm_path).to(device).eval()
+
+         # Set mode
+         self.mode = mode
+         if mode == 'plugin':
+             self._init_plugin_mode()
+         elif mode == 'end2end':
+             self._init_end2end_mode()
+         else:
+             raise NotImplementedError("Select mode from 'plugin' and 'end2end'")
+
+         # Chunk inputs into clips of chunk_size seconds (content features run at 50 frames/s)
+         self.chunk_size = chunk_size * 50
+
+     def _ensure_checkpoints_exist(self):
+         checkpoints = [
+             ('vocoder_path', self.config.get('vocoder_url')),
+             ('vocoder_config_path', self.config.get('vocoder_config_url')),
+             ('speaker_path', self.config.get('speaker_url')),
+             ('dreamvc.ckpt_path', self.config.get('dreamvc', {}).get('ckpt_url')),
+             ('rediffvc.ckpt_path', self.config.get('rediffvc', {}).get('ckpt_url')),
+             ('dreamvg.ckpt_path', self.config.get('dreamvg', {}).get('ckpt_url'))
+         ]
+
+         for path_key, url in checkpoints:
+             local_path = self._get_local_path(path_key)
+             if not local_path.exists() and url:
+                 print(f"Downloading {path_key} from {url}")
+                 self._download_file(url, local_path)
+
+     def _get_local_path(self, path_key):
+         # Resolve a dotted key such as 'dreamvc.ckpt_path' against the nested config dict
+         keys = path_key.split('.')
+         local_path = self.config
+         for key in keys:
+             local_path = local_path.get(key, {})
+         return self.script_dir / local_path
+
+     def _download_file(self, url, local_path):
+         try:
+             # Attempt to send a GET request to the URL
+             response = requests.get(url, stream=True)
+             response.raise_for_status()  # Raise an exception for HTTP errors
+         except requests.exceptions.RequestException as e:
+             # Log the error for debugging purposes
+             print(f"Error encountered: {e}")
+
+             # Development mode: prompt the user for a Hugging Face API key
+             user_input = input("Private checkpoint, please request authorization and enter your Hugging Face API key: ")
+             self.hf_key = user_input if user_input else None
+
+             # Set headers if an API key is provided
+             headers = {'Authorization': f'Bearer {self.hf_key}'} if self.hf_key else {}
+
+             try:
+                 # Retry the GET request with auth headers in development mode
+                 response = requests.get(url, stream=True, headers=headers)
+                 response.raise_for_status()
+             except requests.exceptions.RequestException as e:
+                 # Give up if the authorized retry also fails
+                 print(f"Error encountered in dev mode: {e}")
+                 raise RuntimeError(f"Could not download {url}") from e
+
+         local_path.parent.mkdir(parents=True, exist_ok=True)
+
+         total_size = int(response.headers.get('content-length', 0))
+         block_size = 8192
+         t = tqdm(total=total_size, unit='iB', unit_scale=True)
+
+         with open(local_path, 'wb') as f:
+             for chunk in response.iter_content(chunk_size=block_size):
+                 t.update(len(chunk))
+                 f.write(chunk)
+         t.close()
+
+     def _init_plugin_mode(self):
+         # Initialize ReDiffVC
+         self.dreamvc = ReDiffVC(
+             config_path=self.script_dir / self.config['rediffvc']['config_path'],
+             ckpt_path=self.script_dir / self.config['rediffvc']['ckpt_path'],
+             device=self.device
+         )
+
+         # Initialize DreamVG
+         self.dreamvg = DreamVG(
+             config_path=self.script_dir / self.config['dreamvg']['config_path'],
+             ckpt_path=self.script_dir / self.config['dreamvg']['ckpt_path'],
+             device=self.device
+         )
+
+         # Load speaker encoder
+         spk_encoder.load_model(self.script_dir / self.config['speaker_path'], self.device)
+         self.spk_encoder = spk_encoder
+         self.spk_embed_cache = None
+
+     def _init_end2end_mode(self):
+         # Initialize DreamVC
+         self.dreamvc = DreamVC(
+             config_path=self.script_dir / self.config['dreamvc']['config_path'],
+             ckpt_path=self.script_dir / self.config['dreamvc']['ckpt_path'],
+             device=self.device
+         )
+
+     def _load_content(self, audio_path):
+         content_audio, _ = librosa.load(audio_path, sr=16000)
+         # Calculate the required length to make it a multiple of 16*160 samples
+         target_length = ((len(content_audio) + 16*160 - 1) // (16*160)) * (16*160)
+         # Pad with zeros if necessary
+         if len(content_audio) < target_length:
+             content_audio = np.pad(content_audio, (0, target_length - len(content_audio)), mode='constant')
+         content_audio = torch.tensor(content_audio).unsqueeze(0).to(self.device)
+         content_clip = get_content(self.content_model, content_audio)
+         return content_clip
+
+     def load_spk_embed(self, emb_path):
+         self.spk_embed_cache = torch.load(emb_path, map_location=self.device)
+
+     def save_spk_embed(self, emb_path):
+         assert self.spk_embed_cache is not None
+         torch.save(self.spk_embed_cache.cpu(), emb_path)
+
+     def save_audio(self, output_path, audio, sr):
+         sf.write(output_path, audio, samplerate=sr)
+
+     @torch.no_grad()
+     def genvc(self, content_audio, prompt,
+               prompt_guidance_scale=3, prompt_guidance_rescale=0.0,
+               prompt_ddim_steps=100, prompt_eta=1, prompt_random_seed=None,
+               vc_guidance_scale=3, vc_guidance_rescale=0.7,
+               vc_ddim_steps=50, vc_eta=1, vc_random_seed=None):
+
+         content_clip = self._load_content(content_audio)
+
+         text_batch = self.tokenizer(prompt, max_length=32,
+                                     padding='max_length', truncation=True, return_tensors="pt")
+         text, text_mask = text_batch.input_ids.to(self.device), \
+             text_batch.attention_mask.to(self.device)
+         text = self.text_encoder(input_ids=text, attention_mask=text_mask)[0]
+
+         if self.mode == 'plugin':
+             spk_embed = self.dreamvg.inference([text, text_mask],
+                                                guidance_scale=prompt_guidance_scale,
+                                                guidance_rescale=prompt_guidance_rescale,
+                                                ddim_steps=prompt_ddim_steps, eta=prompt_eta,
+                                                random_seed=prompt_random_seed)
+
+             B, L, D = content_clip.shape
+             gen_audio_chunks = []
+             num_chunks = (L + self.chunk_size - 1) // self.chunk_size
+             for i in range(num_chunks):
+                 start_idx = i * self.chunk_size
+                 end_idx = min((i + 1) * self.chunk_size, L)
+                 content_clip_chunk = content_clip[:, start_idx:end_idx, :]
+
+                 gen_audio_chunk = self.dreamvc.inference(
+                     spk_embed, content_clip_chunk, None,
+                     guidance_scale=vc_guidance_scale,
+                     guidance_rescale=vc_guidance_rescale,
+                     ddim_steps=vc_ddim_steps,
+                     eta=vc_eta,
+                     random_seed=vc_random_seed)
+
+                 gen_audio_chunks.append(gen_audio_chunk)
+
+             gen_audio = torch.cat(gen_audio_chunks, dim=-1)
+
+             self.spk_embed_cache = spk_embed
+
+         elif self.mode == 'end2end':
+             B, L, D = content_clip.shape
+             gen_audio_chunks = []
+             num_chunks = (L + self.chunk_size - 1) // self.chunk_size
+
+             for i in range(num_chunks):
+                 start_idx = i * self.chunk_size
+                 end_idx = min((i + 1) * self.chunk_size, L)
+                 content_clip_chunk = content_clip[:, start_idx:end_idx, :]
+
+                 # Convert the current chunk under the text condition
+                 gen_audio_chunk = self.dreamvc.inference([text, text_mask], content_clip_chunk,
+                                                          guidance_scale=prompt_guidance_scale,
+                                                          guidance_rescale=prompt_guidance_rescale,
+                                                          ddim_steps=prompt_ddim_steps,
+                                                          eta=prompt_eta, random_seed=prompt_random_seed)
+                 gen_audio_chunks.append(gen_audio_chunk)
+
+             gen_audio = torch.cat(gen_audio_chunks, dim=-1)
+
+         else:
+             raise NotImplementedError("Select mode from 'plugin' and 'end2end'")
+
+         gen_audio = self.hifigan(gen_audio.squeeze(1))
+         gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0)
+
+         return gen_audio, self.sr
+
+     @torch.no_grad()
+     def simplevc(self, content_audio, speaker_audio=None, use_spk_cache=False,
+                  vc_guidance_scale=3, vc_guidance_rescale=0.7,
+                  vc_ddim_steps=50, vc_eta=1, vc_random_seed=None):
+
+         assert self.mode == 'plugin'
+         if speaker_audio is not None:
+             speaker_audio, _ = librosa.load(speaker_audio, sr=16000)
+             speaker_audio = torch.tensor(speaker_audio).unsqueeze(0).to(self.device)
+             spk_embed = spk_encoder.embed_utterance_batch(speaker_audio)
+             self.spk_embed_cache = spk_embed
+         elif use_spk_cache:
+             assert self.spk_embed_cache is not None
+             spk_embed = self.spk_embed_cache
+         else:
+             raise NotImplementedError
+
+         content_clip = self._load_content(content_audio)
+
+         B, L, D = content_clip.shape
+         gen_audio_chunks = []
+         num_chunks = (L + self.chunk_size - 1) // self.chunk_size
+         for i in range(num_chunks):
+             start_idx = i * self.chunk_size
+             end_idx = min((i + 1) * self.chunk_size, L)
+             content_clip_chunk = content_clip[:, start_idx:end_idx, :]
+
+             gen_audio_chunk = self.dreamvc.inference(
+                 spk_embed, content_clip_chunk, None,
+                 guidance_scale=vc_guidance_scale,
+                 guidance_rescale=vc_guidance_rescale,
+                 ddim_steps=vc_ddim_steps,
+                 eta=vc_eta,
+                 random_seed=vc_random_seed)
+
+             gen_audio_chunks.append(gen_audio_chunk)
+
+         gen_audio = torch.cat(gen_audio_chunks, dim=-1)
+
+         gen_audio = self.hifigan(gen_audio.squeeze(1))
+         gen_audio = gen_audio.cpu().numpy().squeeze(0).squeeze(0)
+
+         return gen_audio, self.sr
+
+
+ if __name__ == '__main__':
+     dreamvoice = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda')
+     content_audio = 'test.wav'
+     speaker_audio = 'speaker.wav'
+     prompt = 'young female voice, sounds young and cute'
+     gen_audio, sr = dreamvoice.genvc(content_audio, prompt)
+     dreamvoice.save_audio('debug.wav', gen_audio, sr)
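
A minimal usage sketch of the DreamVoice API added above, assuming the package is importable as dreamvoice and the checkpoints in dreamvc.yaml download successfully; the file names and the prompt are placeholders:

from dreamvoice import DreamVoice

dv = DreamVoice(config='dreamvc.yaml', mode='plugin', device='cuda')

# Text-guided conversion: DreamVG generates a speaker embedding from the prompt,
# then ReDiffVC converts the input to that voice.
audio, sr = dv.genvc('input.wav', 'young female voice, sounds young and cute')
dv.save_audio('prompt_vc.wav', audio, sr)

# The prompt's speaker embedding is cached; persist it to skip DreamVG next time.
dv.save_spk_embed('voice.pt')

# Reference-guided conversion in plugin mode: mimic a speaker from an audio sample.
audio, sr = dv.simplevc('input.wav', speaker_audio='reference.wav')
dv.save_audio('ref_vc.wav', audio, sr)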
dreamvoice/ckpts/bigvgan_24k/config.json ADDED
@@ -0,0 +1,44 @@
+ {
+   "resblock": "1",
+   "num_gpus": 0,
+   "batch_size": 32,
+   "learning_rate": 0.0001,
+   "adam_b1": 0.8,
+   "adam_b2": 0.99,
+   "lr_decay": 0.999,
+   "seed": 1234,
+
+   "upsample_rates": [10,6,4,2],
+   "upsample_kernel_sizes": [20,12,8,4],
+   "upsample_initial_channel": 512,
+   "resblock_kernel_sizes": [3,7,11],
+   "resblock_dilation_sizes": [[1,3,5], [1,3,5], [1,3,5]],
+
+   "activation": "snakebeta",
+   "snake_logscale": true,
+
+   "resolutions": [[1024, 120, 600], [2048, 240, 1200], [512, 50, 240]],
+   "mpd_reshapes": [2, 3, 5, 7, 11],
+   "use_spectral_norm": false,
+   "discriminator_channel_mult": 1,
+
+   "segment_size": 12000,
+   "num_mels": 128,
+   "n_fft": 1920,
+   "hop_size": 480,
+   "win_size": 1920,
+
+   "sampling_rate": 24000,
+
+   "fmin": 0,
+   "fmax": 12000,
+   "fmax_for_loss": null,
+
+   "num_workers": 4,
+
+   "dist_config": {
+     "dist_backend": "nccl",
+     "dist_url": "tcp://localhost:54321",
+     "world_size": 1
+   }
+ }
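
A quick sanity check on the rates implied by this vocoder config and the API's chunking above; the 50 frames/s figure is derived from the config, not stated in it:

frames_per_second = 24000 / 480        # sampling_rate / hop_size = 50 mel frames per second
chunk_frames = 16 * frames_per_second  # default chunk_size=16 s -> 800 content frames per chunk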
dreamvoice/ckpts/bigvgan_24k/g_01000000.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:683a7baafedda8ec2fd2409deff61bd58ae66fbf10630550a17fcfed6f728977
+ size 58405452
dreamvoice/ckpts/dreamvc_base.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5abe034bf590e2ce0405c66e950dc61f041629731e959cb09e2009688cd1254c
+ size 300117179
dreamvoice/ckpts/dreamvc_cross.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:87b4eb1e62b1bf4e157edc2766b9b4461c0be0f7d98a970d6b087f3797c35920
+ size 451974443
dreamvoice/ckpts/dreamvc_plugin.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2396f6b96e9057e73e20eee173d7aaded6b5eb70745a9f5282999c0ea9a4d848
+ size 104892440
dreamvoice/ckpts/spk_encoder/pretrained.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:39373b86598fa3da9fcddee6142382efe09777e8d37dc9c0561f41f0070f134e
+ size 17090379
dreamvoice/dreamvc.yaml ADDED
@@ -0,0 +1,26 @@
+ version: 1.0
+
+ sample_rate: 24000
+ vocoder_path: 'ckpts/bigvgan_24k/g_01000000.pt'
+ vocoder_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/g_01000000.pt'
+ vocoder_config_path: 'ckpts/bigvgan_24k/config.json'
+ vocoder_config_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/bigvgan_24k/config.json'
+
+ speaker_path: 'ckpts/spk_encoder/pretrained.pt'
+ speaker_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/spk_encoder/pretrained.pt'
+ lm_path: 'google/flan-t5-base'
+
+ dreamvc:
+   config_path: 'src/configs/diffvc_cross.yaml'
+   ckpt_path: 'ckpts/dreamvc_cross.pt'
+   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_cross.pt'
+
+ rediffvc:
+   config_path: 'src/configs/diffvc_base.yaml'
+   ckpt_path: 'ckpts/dreamvc_base.pt'
+   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_base.pt'
+
+ dreamvg:
+   config_path: 'src/configs/plugin_cross.yaml'
+   ckpt_path: 'ckpts/dreamvc_plugin.pt'
+   ckpt_url: 'https://huggingface.co/myshell-ai/DreamVoice/resolve/main/dreamvoice/ckpts/dreamvc_plugin.pt'
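
The nesting in this file matters: api.py resolves dotted keys such as 'dreamvc.ckpt_path' by walking the loaded dict. Roughly, as a condensed restatement of _get_local_path above:

import yaml
from pathlib import Path

config = yaml.safe_load(open('dreamvoice/dreamvc.yaml'))

def get_local_path(path_key, script_dir=Path('dreamvoice')):
    node = config
    for key in path_key.split('.'):   # 'dreamvc.ckpt_path' -> ['dreamvc', 'ckpt_path']
        node = node.get(key, {})
    return script_dir / node

print(get_local_path('dreamvc.ckpt_path'))  # dreamvoice/ckpts/dreamvc_cross.pt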
dreamvoice/src/.ipynb_checkpoints/extract_features-checkpoint.py ADDED
@@ -0,0 +1,103 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/src/extract_features.py below.)
dreamvoice/src/.ipynb_checkpoints/plugin_wrapper-checkpoint.py ADDED
@@ -0,0 +1,76 @@
+ import yaml
+ import torch
+ from diffusers import DDIMScheduler
+ from .model.p2e_cross import P2E_Cross
+ from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
+
+
+ class DreamVG(object):
+     def __init__(self,
+                  config_path='configs/plugin_cross.yaml',
+                  ckpt_path='../ckpts/dreamvc_plugin.pt',
+                  device='cpu'):
+
+         with open(config_path, 'r') as fp:
+             config = yaml.safe_load(fp)
+
+         self.device = device
+         self.model = P2E_Cross(config['model']).to(device)
+         self.model.load_state_dict(torch.load(ckpt_path)['model'])
+         self.model.eval()
+
+         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+                                         beta_start=config['scheduler']['beta_start'],
+                                         beta_end=config['scheduler']['beta_end'],
+                                         rescale_betas_zero_snr=True,
+                                         timestep_spacing="trailing",
+                                         clip_sample=False,
+                                         prediction_type='v_prediction')
+         self.noise_scheduler = noise_scheduler
+         self.scale = config['scheduler']['scale']
+         self.shift = config['scheduler']['shift']
+         self.spk_shape = config['model']['unet']['in_channels']
+
+     @torch.no_grad()
+     def inference(self, text,
+                   guidance_scale=5, guidance_rescale=0.7,
+                   ddim_steps=50, eta=1, random_seed=2023):
+         text, text_mask = text
+         self.model.eval()
+
+         gen_shape = (1, self.spk_shape)
+
+         if random_seed is not None:
+             generator = torch.Generator(device=self.device).manual_seed(random_seed)
+         else:
+             generator = torch.Generator(device=self.device)
+             generator.seed()
+
+         self.noise_scheduler.set_timesteps(ddim_steps)
+
+         # init noise
+         noise = torch.randn(gen_shape, generator=generator, device=self.device)
+         latents = noise
+
+         for t in self.noise_scheduler.timesteps:
+             latents = self.noise_scheduler.scale_model_input(latents, t)
+
+             if guidance_scale:
+                 # Conditional and unconditional passes for classifier-free guidance
+                 output_text = self.model(latents, t, text, text_mask, train_cfg=False)
+                 output_uncond = self.model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
+
+                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+                 if guidance_rescale > 0.0:
+                     output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                     guidance_rescale=guidance_rescale)
+             else:
+                 output_pred = self.model(latents, t, text, text_mask, train_cfg=False)
+
+             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+                                                 eta=eta, generator=generator).prev_sample
+
+         # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
+         pred = scale_shift_re(latents, 1/self.scale, self.shift)
+         # pred = torch.clip(pred, min=0.0, max=0.5)
+         return pred
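
Both the plugin and VC wrappers apply classifier-free guidance followed by the variance-rescale trick of Lin et al. (2023). The imported rescale_noise_cfg lives in src/utils.py, which this 50-file view does not show; a sketch of what it presumably computes, following the reference formulation:

import torch

def rescale_noise_cfg(noise_cfg, noise_pred_cond, guidance_rescale=0.7):
    # Match the per-sample std of the guided prediction to the conditional
    # prediction's std, then blend; this counteracts CFG over-exposure.
    std_cond = noise_pred_cond.std(dim=list(range(1, noise_pred_cond.ndim)), keepdim=True)
    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
    rescaled = noise_cfg * (std_cond / std_cfg)
    return guidance_rescale * rescaled + (1 - guidance_rescale) * noise_cfg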
dreamvoice/src/.ipynb_checkpoints/train_plugin-checkpoint.py ADDED
File without changes
dreamvoice/src/.ipynb_checkpoints/train_vc-checkpoint.py ADDED
File without changes
dreamvoice/src/.ipynb_checkpoints/vc_wrapper-checkpoint.py ADDED
@@ -0,0 +1,144 @@
+ import yaml
+ import torch
+ from diffusers import DDIMScheduler
+ from .model.model import DiffVC
+ from .model.model_cross import DiffVC_Cross
+ from .utils import scale_shift, scale_shift_re, rescale_noise_cfg
+
+
+ class ReDiffVC(object):
+     def __init__(self,
+                  config_path='configs/diffvc_base.yaml',
+                  ckpt_path='../ckpts/dreamvc_base.pt',
+                  device='cpu'):
+
+         with open(config_path, 'r') as fp:
+             config = yaml.safe_load(fp)
+
+         self.device = device
+         self.model = DiffVC(config['model']).to(device)
+         self.model.load_state_dict(torch.load(ckpt_path)['model'])
+         self.model.eval()
+
+         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+                                         beta_start=config['scheduler']['beta_start'],
+                                         beta_end=config['scheduler']['beta_end'],
+                                         rescale_betas_zero_snr=True,
+                                         timestep_spacing="trailing",
+                                         clip_sample=False,
+                                         prediction_type='v_prediction')
+         self.noise_scheduler = noise_scheduler
+         self.scale = config['scheduler']['scale']
+         self.shift = config['scheduler']['shift']
+         self.melshape = config['model']['unet']['sample_size'][0]
+
+     @torch.no_grad()
+     def inference(self,
+                   spk_embed, content_clip, f0_clip=None,
+                   guidance_scale=3, guidance_rescale=0.7,
+                   ddim_steps=50, eta=1, random_seed=2023):
+
+         self.model.eval()
+         if random_seed is not None:
+             generator = torch.Generator(device=self.device).manual_seed(random_seed)
+         else:
+             generator = torch.Generator(device=self.device)
+             generator.seed()
+
+         self.noise_scheduler.set_timesteps(ddim_steps)
+
+         # init noise
+         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
+         noise = torch.randn(gen_shape, generator=generator, device=self.device)
+         latents = noise
+
+         for t in self.noise_scheduler.timesteps:
+             latents = self.noise_scheduler.scale_model_input(latents, t)
+
+             if guidance_scale:
+                 output_text = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
+                 output_uncond = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=True,
+                                            speaker_cfg=1.0, pitch_cfg=0.0)
+
+                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+                 if guidance_rescale > 0.0:
+                     output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                     guidance_rescale=guidance_rescale)
+             else:
+                 output_pred = self.model(latents, t, content_clip, spk_embed, f0_clip, train_cfg=False)
+
+             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+                                                 eta=eta, generator=generator).prev_sample
+
+         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
+         return pred
+
+
+ class DreamVC(object):
+     def __init__(self,
+                  config_path='configs/diffvc_cross.yaml',
+                  ckpt_path='../ckpts/dreamvc_cross.pt',
+                  device='cpu'):
+
+         with open(config_path, 'r') as fp:
+             config = yaml.safe_load(fp)
+
+         self.device = device
+         self.model = DiffVC_Cross(config['model']).to(device)
+         self.model.load_state_dict(torch.load(ckpt_path)['model'])
+         self.model.eval()
+
+         noise_scheduler = DDIMScheduler(num_train_timesteps=config['scheduler']['num_train_steps'],
+                                         beta_start=config['scheduler']['beta_start'],
+                                         beta_end=config['scheduler']['beta_end'],
+                                         rescale_betas_zero_snr=True,
+                                         timestep_spacing="trailing",
+                                         clip_sample=False,
+                                         prediction_type='v_prediction')
+         self.noise_scheduler = noise_scheduler
+         self.scale = config['scheduler']['scale']
+         self.shift = config['scheduler']['shift']
+         self.melshape = config['model']['unet']['sample_size'][0]
+
+     @torch.no_grad()
+     def inference(self,
+                   text, content_clip, f0_clip=None,
+                   guidance_scale=3, guidance_rescale=0.7,
+                   ddim_steps=50, eta=1, random_seed=2023):
+
+         text, text_mask = text
+         self.model.eval()
+         if random_seed is not None:
+             generator = torch.Generator(device=self.device).manual_seed(random_seed)
+         else:
+             generator = torch.Generator(device=self.device)
+             generator.seed()
+
+         self.noise_scheduler.set_timesteps(ddim_steps)
+
+         # init noise
+         gen_shape = (1, 1, self.melshape, content_clip.shape[-2])
+         noise = torch.randn(gen_shape, generator=generator, device=self.device)
+         latents = noise
+
+         for t in self.noise_scheduler.timesteps:
+             latents = self.noise_scheduler.scale_model_input(latents, t)
+
+             if guidance_scale:
+                 output_text = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
+                 output_uncond = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=True,
+                                            speaker_cfg=1.0, pitch_cfg=0.0)
+
+                 output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+                 if guidance_rescale > 0.0:
+                     output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                     guidance_rescale=guidance_rescale)
+             else:
+                 output_pred = self.model(latents, t, content_clip, text, text_mask, f0_clip, train_cfg=False)
+
+             latents = self.noise_scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+                                                 eta=eta, generator=generator).prev_sample
+
+         pred = scale_shift_re(latents, scale=1/self.scale, shift=self.shift)
+         return pred
dreamvoice/src/__pycache__/plugin_wrapper.cpython-310.pyc ADDED
Binary file (2.41 kB).
dreamvoice/src/__pycache__/plugin_wrapper.cpython-311.pyc ADDED
Binary file (4.38 kB).
dreamvoice/src/__pycache__/vc_wrapper.cpython-310.pyc ADDED
Binary file (3.49 kB).
dreamvoice/src/__pycache__/vc_wrapper.cpython-311.pyc ADDED
Binary file (7.81 kB).
dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base-checkpoint.yaml ADDED
@@ -0,0 +1,47 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/src/configs/diffvc_base.yaml below.)
dreamvoice/src/configs/.ipynb_checkpoints/diffvc_base_pitch-checkpoint.yaml ADDED
@@ -0,0 +1,34 @@
+ version: 1.0
+
+ system: "base"
+
+ diffwrap:
+   cls_embedding:
+     speaker_dim: 256
+     feature_dim: 512
+     content_dim: 768
+     content_hidden: 256
+     use_pitch: true
+     pitch_dim: 1
+     pitch_hidden: 128
+
+   unet:
+     sample_size: [128, 256]
+     in_channels: 385
+     out_channels: 1
+     layers_per_block: 2
+     block_out_channels: [256, 256, 512]
+     down_block_types:
+       [
+         "DownBlock2D",
+         "AttnDownBlock2D",
+         "AttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "AttnUpBlock2D",
+         "AttnUpBlock2D",
+         "UpBlock2D"
+       ]
+     attention_head_dim: 32
+     class_embed_type: 'identity'
dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross-checkpoint.yaml ADDED
@@ -0,0 +1,45 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/src/configs/diffvc_cross.yaml below.)
dreamvoice/src/configs/.ipynb_checkpoints/diffvc_cross_pitch-checkpoint.yaml ADDED
@@ -0,0 +1,33 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/src/configs/diffvc_cross_pitch.yaml below.)
dreamvoice/src/configs/.ipynb_checkpoints/plugin_cross-checkpoint.yaml ADDED
@@ -0,0 +1,39 @@
(Jupyter checkpoint copy; contents identical to dreamvoice/src/configs/plugin_cross.yaml below.)
dreamvoice/src/configs/diffvc_base.yaml ADDED
@@ -0,0 +1,47 @@
+ version: 1.0
+
+ system: "base"
+
+ model:
+   cls_embedding:
+     speaker_dim: 256
+     feature_dim: 512
+     content_dim: 768
+     content_hidden: 256
+     use_pitch: false
+
+   unet:
+     sample_size: [128, 256]
+     in_channels: 257
+     out_channels: 1
+     layers_per_block: 2
+     block_out_channels: [128, 256, 256, 512]
+     down_block_types:
+       [
+         "DownBlock2D",
+         "DownBlock2D",
+         "AttnDownBlock2D",
+         "AttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "AttnUpBlock2D",
+         "AttnUpBlock2D",
+         "UpBlock2D",
+         "UpBlock2D"
+       ]
+     attention_head_dim: 32
+     class_embed_type: 'identity'
+
+ scheduler:
+   num_train_steps: 1000
+   beta_schedule: 'linear'
+   beta_start: 0.0001
+   beta_end: 0.02
+   num_infer_steps: 50
+   rescale_betas_zero_snr: true
+   timestep_spacing: "trailing"
+   clip_sample: false
+   prediction_type: 'v_prediction'
+   scale: 2.75
+   shift: 5.80
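
The scheduler block maps directly onto the diffusers DDIMScheduler constructed in the wrappers above; scale and shift are consumed separately by scale_shift_re. A condensed restatement:

import yaml
from diffusers import DDIMScheduler

cfg = yaml.safe_load(open('dreamvoice/src/configs/diffvc_base.yaml'))['scheduler']
scheduler = DDIMScheduler(num_train_timesteps=cfg['num_train_steps'],
                          beta_start=cfg['beta_start'],
                          beta_end=cfg['beta_end'],
                          rescale_betas_zero_snr=True,
                          timestep_spacing="trailing",
                          clip_sample=False,
                          prediction_type='v_prediction')
scheduler.set_timesteps(cfg['num_infer_steps'])  # 50 DDIM steps at inference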
dreamvoice/src/configs/diffvc_base_pitch.yaml ADDED
@@ -0,0 +1,34 @@
+ version: 1.0
+
+ system: "base"
+
+ diffwrap:
+   cls_embedding:
+     speaker_dim: 256
+     feature_dim: 512
+     content_dim: 768
+     content_hidden: 256
+     use_pitch: true
+     pitch_dim: 1
+     pitch_hidden: 128
+
+   unet:
+     sample_size: [128, 256]
+     in_channels: 385
+     out_channels: 1
+     layers_per_block: 2
+     block_out_channels: [128, 256, 512]
+     down_block_types:
+       [
+         "DownBlock2D",
+         "AttnDownBlock2D",
+         "AttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "AttnUpBlock2D",
+         "AttnUpBlock2D",
+         "UpBlock2D"
+       ]
+     attention_head_dim: 32
+     class_embed_type: 'identity'
dreamvoice/src/configs/diffvc_cross.yaml ADDED
@@ -0,0 +1,45 @@
+ version: 1.0
+
+ system: "cross"
+
+ model:
+   cls_embedding:
+     content_dim: 768
+     content_hidden: 256
+     use_pitch: false
+
+   unet:
+     sample_size: [128, 256]
+     in_channels: 257
+     out_channels: 1
+     layers_per_block: 2
+     block_out_channels: [128, 256, 256, 512]
+     down_block_types:
+       [
+         "DownBlock2D",
+         "DownBlock2D",
+         "CrossAttnDownBlock2D",
+         "CrossAttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "CrossAttnUpBlock2D",
+         "CrossAttnUpBlock2D",
+         "UpBlock2D",
+         "UpBlock2D",
+       ]
+     attention_head_dim: 32
+     cross_attention_dim: 768
+
+ scheduler:
+   num_train_steps: 1000
+   beta_schedule: 'linear'
+   beta_start: 0.0001
+   beta_end: 0.02
+   num_infer_steps: 50
+   rescale_betas_zero_snr: true
+   timestep_spacing: "trailing"
+   clip_sample: false
+   prediction_type: 'v_prediction'
+   scale: 2.75
+   shift: 5.80
dreamvoice/src/configs/diffvc_cross_pitch.yaml ADDED
@@ -0,0 +1,33 @@
+ version: 1.0
+
+ system: "cross"
+
+ diffwrap:
+   cls_embedding:
+     content_dim: 768
+     content_hidden: 256
+     use_pitch: true
+     pitch_dim: 1
+     pitch_hidden: 128
+
+   unet:
+     sample_size: [100, 256]
+     in_channels: 385
+     out_channels: 1
+     layers_per_block: 2
+     block_out_channels: [128, 256, 512]
+     down_block_types:
+       [
+         "DownBlock2D",
+         "CrossAttnDownBlock2D",
+         "CrossAttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "CrossAttnUpBlock2D",
+         "CrossAttnUpBlock2D",
+         "UpBlock2D",
+       ]
+     attention_head_dim: 32
+     cross_attention_dim: 768
dreamvoice/src/configs/plugin_cross.yaml ADDED
@@ -0,0 +1,39 @@
+ version: 1.0
+
+ system: "cross"
+
+ model:
+   cls_embedding:
+     content_dim: 768
+     content_hidden: 256
+
+   unet:
+     sample_size: [1, 1]
+     in_channels: 256
+     out_channels: 256
+     layers_per_block: 2
+     block_out_channels: [256]
+     down_block_types:
+       [
+         "CrossAttnDownBlock2D",
+       ]
+     up_block_types:
+       [
+         "CrossAttnUpBlock2D",
+       ]
+     attention_head_dim: 32
+     cross_attention_dim: 768
+
+ scheduler:
+   num_train_steps: 1000
+   beta_schedule: 'linear'
+   beta_start: 0.0001
+   beta_end: 0.02
+   num_infer_steps: 50
+   rescale_betas_zero_snr: true
+   timestep_spacing: "trailing"
+   clip_sample: false
+   prediction_type: 'v_prediction'
+   scale: 0.05
+   shift: -0.035
dreamvoice/src/debug.py ADDED
File without changes
dreamvoice/src/extract_features.py ADDED
@@ -0,0 +1,103 @@
+ import os
+ import torch
+ import librosa
+ import numpy as np
+ import soundfile as sf
+ import pandas as pd
+ # from feats.hubert_model import get_soft_model, get_hubert_soft_content
+ from feats.contentvec_hf import get_content_model, get_content
+ # from modules.speaker_encoder.encoder import inference as spk_encoder
+ # from pathlib import Path
+ from tqdm import tqdm
+ from multiprocessing import Process
+ import pyworld as pw
+
+
+ def resample_save(infolder, audio_path, model,
+                   audio_sr=24000, content_sr=16000, min_length=1.92,
+                   content_resolution=50,
+                   save_path='features'):
+     if os.path.isfile(save_path + '/' + 'audio_24k/' + audio_path) is False:
+         audio, sr = librosa.load(infolder + audio_path, sr=content_sr)
+         final_length = audio.shape[-1] // (content_sr / content_resolution) * (content_sr / content_resolution)
+         # truncate to a whole number of content frames (320 samples each at 16 kHz)
+
+         length = max(round(min_length * content_sr), round(final_length))
+         assert length % 10 == 0
+         audio = audio[:length]
+         audio_save = np.zeros(length, dtype=audio.dtype)
+         audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
+
+         # content = get_hubert_soft_content(model, torch.tensor(audio_save).unsqueeze(0))
+         content = get_content(model, torch.tensor(audio_save).unsqueeze(0))
+         content = content.cpu()
+         os.makedirs(os.path.dirname(save_path + '/' + 'content/' + audio_path), exist_ok=True)
+         torch.save(content, save_path + '/' + 'content/' + audio_path + '.pt')
+         # print(audio_save.shape)
+         # print(content.shape)
+         os.makedirs(os.path.dirname(save_path + '/' + 'audio_16k/' + audio_path), exist_ok=True)
+         sf.write(save_path + '/' + 'audio_16k/' + audio_path, audio_save, int(sr))
+         # print(save_path + '/' + 'audio_16k/' + audio_path)
+
+         audio, sr = librosa.load(infolder + audio_path, sr=audio_sr)
+         length = max(round(min_length * audio_sr), round(final_length / content_sr * audio_sr))
+         assert length % 10 == 0
+         audio = audio[:length]
+         audio_save = np.zeros(length, dtype=audio.dtype)
+         audio_save[:audio.shape[-1]] = audio[:audio.shape[-1]]
+         # print(audio_save.shape)
+         os.makedirs(os.path.dirname(save_path + '/' + 'audio_24k/' + audio_path), exist_ok=True)
+         sf.write(save_path + '/' + 'audio_24k/' + audio_path, audio_save, int(sr))
+
+
+ def extract_f0(in_folder, audio_path, save_path):
+     audio, sr = librosa.load(in_folder + audio_path, sr=None)
+     assert sr == 16000
+     if os.path.isfile(save_path + '/' + 'f0/' + audio_path + '.pt') is False:
+         # wav = audio
+         # wav = np.pad(wav, int((1024 - 320) / 2), mode='reflect')
+         # f0_, _, _ = librosa.pyin(wav, frame_length=1024, hop_length=320, center=False, sr=sr,
+         #                          fmin=librosa.note_to_hz('C2'),
+         #                          fmax=librosa.note_to_hz('C6'))
+
+         _f0, t = pw.dio(audio.astype(np.float64), sr, frame_period=320 / sr * 1000)
+         f0 = pw.stonemask(audio.astype(np.float64), _f0, t, sr)[:-1]
+
+         f0 = np.nan_to_num(f0)
+         os.makedirs(os.path.dirname(save_path + '/' + 'f0/' + audio_path), exist_ok=True)
+         # print(save_path + '/' + 'f0/' + audio_path + '.pt')
+         torch.save(torch.tensor(f0), save_path + '/' + 'f0/' + audio_path + '.pt')
+
+
+ def chunks(arr, m):
+     result = [[] for _ in range(m)]
+     for i in range(len(arr)):
+         result[i % m].append(arr[i])
+     return result
+
+
+ def extract_f0_main(in_folder, audio_paths, save_path):
+     for audio_path in tqdm(audio_paths):
+         extract_f0(in_folder, audio_path, save_path)
+
+
+ if __name__ == '__main__':
+     df = pd.read_csv('../test_data/vc_meta.csv')
+     # model = get_soft_model('../pre_ckpts/hubert_soft.pt').to('cuda')
+     model = get_content_model().to('cuda')
+     # spk_encoder.load_model(Path('ckpts/spk_encoder/pretrained.pt'), device="cuda")
+     for i in tqdm(range(len(df))):
+         row = df.iloc[i]
+         in_path = row['path']
+         resample_save('../test_data/', in_path, model, save_path='../features/')
+
+     in_folder = '../features/audio_16k/'
+     audio_files = list(df['path'])
+     save_path = '../features/'
+     cores = 6
+
+     subsets = chunks(audio_files, cores)
+
+     for subset in subsets:
+         t = Process(target=extract_f0_main, args=(in_folder, subset, save_path))
+         t.start()
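A quick worked example of the alignment `resample_save` enforces: content features come out at 50 Hz (16 kHz / 320-sample hop), so trimming both waveform copies to a whole number of content frames keeps the 16 kHz and 24 kHz versions the same duration (values below are illustrative):

```python
content_sr, audio_sr, content_resolution = 16000, 24000, 50
hop_16k = content_sr // content_resolution  # 320 samples per content frame
hop_24k = audio_sr // content_resolution    # 480 samples per content frame

n_frames = 100
# 32000 samples at 16 kHz and 48000 at 24 kHz -- both exactly 2.0 seconds
print(n_frames * hop_16k, n_frames * hop_24k)
```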
dreamvoice/src/feats/.ipynb_checkpoints/contentvec-checkpoint.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ import librosa
+ from fairseq import checkpoint_utils
+ import torch.nn.functional as F
+
+
+ def get_model(vec_path):
+     print("load model(s) from {}".format(vec_path))
+     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+         [vec_path],
+         suffix="",
+     )
+     model = models[0]
+     model.eval()
+     return model
+
+
+ @torch.no_grad()
+ def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
+     # wav_16k_tensor: (batch, samples) waveform at 16 kHz
+     wav_16k_tensor = wav_16k_tensor.to(device)
+     # pad so that the output length will be len(audio) // 320
+     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+     feats = wav_16k_tensor
+     padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+     inputs = {
+         "source": feats.to(wav_16k_tensor.device),
+         "padding_mask": padding_mask.to(wav_16k_tensor.device),
+         "output_layer": layer
+     }
+     logits = hmodel.extract_features(**inputs)[0]
+     # feats = hmodel.final_proj(logits[0])
+     return logits
+
+
+ if __name__ == '__main__':
+     audio, sr = librosa.load('test.wav', sr=16000)
+     audio = audio[:100 * 320]
+     model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
+     model = model.cuda()
+     content = get_content(model, torch.tensor([audio]))
+     print(content)
dreamvoice/src/feats/.ipynb_checkpoints/contentvec_hf-checkpoint.py ADDED
@@ -0,0 +1,40 @@
+ from transformers import HubertModel
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import librosa
+
+
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         # The final projection layer is kept only so pretrained checkpoints load;
+         # following https://github.com/auspicious3000/contentvec/issues/6,
+         # it must be left out of the forward pass to achieve the desired outcome.
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+ def get_content_model(config='lengyue233/content-vec-best'):
+     model = HubertModelWithFinalProj.from_pretrained(config)
+     model.eval()
+     return model
+
+
+ @torch.no_grad()
+ def get_content(model, wav_16k_tensor, device='cuda'):
+     # wav_16k_tensor: (batch, samples) waveform at 16 kHz
+     wav_16k_tensor = wav_16k_tensor.to(device)
+     # pad so that the output length will be len(audio) // 320
+     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+     logits = model(wav_16k_tensor)['last_hidden_state']
+     return logits
+
+
+ if __name__ == '__main__':
+     model = get_content_model().cuda()
+     audio, sr = librosa.load('test.wav', sr=16000)
+     audio = audio[:100 * 320]
+     audio = torch.tensor([audio])
+     content = get_content(model, audio, 'cuda')
+     print(content)
dreamvoice/src/feats/.ipynb_checkpoints/hubert_model-checkpoint.py ADDED
@@ -0,0 +1,24 @@
+ import torch, torchaudio
+ from .hubert.hubert import HubertSoft
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+ import librosa
+
+
+ def get_soft_model(model_path):
+     hubert = HubertSoft()
+     # Load checkpoint (either hubert_soft or hubert_discrete)
+     # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
+     checkpoint = torch.load(model_path)
+     consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+     hubert.load_state_dict(checkpoint["hubert"])
+     hubert.eval()
+     return hubert
+
+
+ @torch.no_grad()
+ def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
+     # HubertSoft.units expects (batch, 1, samples), so add the channel dim
+     wav_16k_tensor = wav_16k_tensor.to(device).unsqueeze(1)
+     units = hmodel.units(wav_16k_tensor)
+     # units: (batch, frames, 256) soft speech units
+     return units.cpu()
dreamvoice/src/feats/.ipynb_checkpoints/test-checkpoint.py ADDED
@@ -0,0 +1,22 @@
+ import torch, torchaudio
+ from hubert.hubert import HubertSoft
+ from torch.nn.modules.utils import consume_prefix_in_state_dict_if_present
+ import librosa
+
+
+ def get_soft_model(model_path):
+     hubert = HubertSoft()
+     # Load checkpoint (either hubert_soft or hubert_discrete)
+     # hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
+     checkpoint = torch.load(model_path)
+     consume_prefix_in_state_dict_if_present(checkpoint["hubert"], "module.")
+     hubert.load_state_dict(checkpoint["hubert"])
+     hubert.eval()
+     return hubert
+
+
+ @torch.no_grad()
+ def get_hubert_soft_content(hmodel, wav_16k_tensor, device='cuda'):
+     wav_16k_tensor = wav_16k_tensor.to(device)
+     units = hmodel.units(wav_16k_tensor)
+     return units.cpu()
dreamvoice/src/feats/__pycache__/contentvec.cpython-310.pyc ADDED
Binary file (1.29 kB)
dreamvoice/src/feats/__pycache__/contentvec.cpython-311.pyc ADDED
Binary file (2.23 kB)
dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-310.pyc ADDED
Binary file (1.45 kB)
dreamvoice/src/feats/__pycache__/contentvec_hf.cpython-311.pyc ADDED
Binary file (2.41 kB)
dreamvoice/src/feats/__pycache__/hubert_model.cpython-311.pyc ADDED
Binary file (1.44 kB)
dreamvoice/src/feats/contentvec.py ADDED
@@ -0,0 +1,42 @@
+ import torch
+ import librosa
+ from fairseq import checkpoint_utils
+ import torch.nn.functional as F
+
+
+ def get_model(vec_path):
+     print("load model(s) from {}".format(vec_path))
+     models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task(
+         [vec_path],
+         suffix="",
+     )
+     model = models[0]
+     model.eval()
+     return model
+
+
+ @torch.no_grad()
+ def get_content(hmodel, wav_16k_tensor, device='cuda', layer=12):
+     # wav_16k_tensor: (batch, samples) waveform at 16 kHz
+     wav_16k_tensor = wav_16k_tensor.to(device)
+     # pad so that the output length will be len(audio) // 320
+     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+     feats = wav_16k_tensor
+     padding_mask = torch.BoolTensor(feats.shape).fill_(False)
+     inputs = {
+         "source": feats.to(wav_16k_tensor.device),
+         "padding_mask": padding_mask.to(wav_16k_tensor.device),
+         "output_layer": layer
+     }
+     logits = hmodel.extract_features(**inputs)[0]
+     # feats = hmodel.final_proj(logits[0])
+     return logits
+
+
+ if __name__ == '__main__':
+     audio, sr = librosa.load('test.wav', sr=16000)
+     audio = audio[:100 * 320]
+     model = get_model('../../ckpts/checkpoint_best_legacy_500.pt')
+     model = model.cuda()
+     content = get_content(model, torch.tensor([audio]))
+     print(content)
dreamvoice/src/feats/contentvec_hf.py ADDED
@@ -0,0 +1,40 @@
+ from transformers import HubertModel
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import librosa
+
+
+ class HubertModelWithFinalProj(HubertModel):
+     def __init__(self, config):
+         super().__init__(config)
+
+         # The final projection layer is kept only so pretrained checkpoints load;
+         # following https://github.com/auspicious3000/contentvec/issues/6,
+         # it must be left out of the forward pass to achieve the desired outcome.
+         self.final_proj = nn.Linear(config.hidden_size, config.classifier_proj_size)
+
+
+ def get_content_model(config='lengyue233/content-vec-best'):
+     model = HubertModelWithFinalProj.from_pretrained(config)
+     model.eval()
+     return model
+
+
+ @torch.no_grad()
+ def get_content(model, wav_16k_tensor, device='cuda'):
+     # wav_16k_tensor: (batch, samples) waveform at 16 kHz
+     wav_16k_tensor = wav_16k_tensor.to(device)
+     # pad so that the output length will be len(audio) // 320
+     wav_16k_tensor = F.pad(wav_16k_tensor, ((400 - 320) // 2, (400 - 320) // 2))
+     logits = model(wav_16k_tensor)['last_hidden_state']
+     return logits
+
+
+ if __name__ == '__main__':
+     model = get_content_model().cuda()
+     audio, sr = librosa.load('test.wav', sr=16000)
+     audio = audio[:100 * 320]
+     audio = torch.tensor([audio])
+     content = get_content(model, audio, 'cuda')
+     print(content)
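For reference, the `(400 - 320) // 2` padding makes the output length exactly `len(audio) // 320`: HuBERT's convolutional front end has a 400-sample receptive field and a 320-sample hop, so with 40 samples of padding on each side, every 320 input samples yield one 768-dim frame. A sketch continuing the demo above (the exact shape is inferred from the architecture, not printed by the file):

```python
import torch

model = get_content_model().cuda()
audio = torch.zeros(1, 100 * 320)  # 2 s of silence at 16 kHz
content = get_content(model, audio, 'cuda')
assert content.shape == (1, 100, 768)  # one 768-dim frame per 320 samples
```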
dreamvoice/src/feats/hubert/.gitignore ADDED
@@ -0,0 +1,132 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+
+ # C extensions
+ *.so
+
+ # Distribution / packaging
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .nox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+ db.sqlite3
+ db.sqlite3-journal
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # IPython
+ profile_default/
+ ipython_config.py
+
+ # pyenv
+ .python-version
+
+ # pipenv
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
+ # install all needed dependencies.
+ #Pipfile.lock
+
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
+ __pypackages__/
+
+ # Celery stuff
+ celerybeat-schedule
+ celerybeat.pid
+
+ # SageMath parsed files
+ *.sage.py
+
+ # Environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # VSCode project settings
+ .vscode
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+ .dmypy.json
+ dmypy.json
+
+ # Pyre type checker
+ .pyre/
dreamvoice/src/feats/hubert/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2021 Benjamin van Niekerk
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.