zzk1st committed on
Commit
96ea36d
1 Parent(s): ff8ba22

Copy code from github

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .gitignore +13 -0
  2. APIs.py +215 -0
  3. AudioCraft/app.py +110 -0
  4. Bark/__init__.py +0 -0
  5. Bark/app.py +109 -0
  6. Envs/AudioCraft.yml +237 -0
  7. Envs/Bark.yml +180 -0
  8. Envs/VoiceFixer.yml +123 -0
  9. Envs/WavJourney.yml +248 -0
  10. EnvsSetup/AudioCraft.sh +16 -0
  11. EnvsSetup/Bark.sh +1 -0
  12. EnvsSetup/VoiceFixer.sh +1 -0
  13. EnvsSetup/WavJourney.sh +1 -0
  14. README.md +47 -13
  15. VoiceFixer/app.py +55 -0
  16. VoiceParser/__init__.py +0 -0
  17. VoiceParser/app.py +58 -0
  18. VoiceParser/customtokenizer.py +202 -0
  19. VoiceParser/hubert_manager.py +33 -0
  20. VoiceParser/model.py +102 -0
  21. VoiceParser/pre_kmeans_hubert.py +106 -0
  22. add_voice_preset.py +21 -0
  23. code_generator.py +190 -0
  24. convert_json_to_audio_gen_code.py +30 -0
  25. data/voice_presets/metadata.json +47 -0
  26. data/voice_presets/npz/biden.npz +0 -0
  27. data/voice_presets/npz/boris.npz +0 -0
  28. data/voice_presets/npz/boy_Tom_Hiddleston.npz +0 -0
  29. data/voice_presets/npz/child_boy.npz +0 -0
  30. data/voice_presets/npz/cnn_male_speaker.npz +0 -0
  31. data/voice_presets/npz/elder_morgen.npz +0 -0
  32. data/voice_presets/npz/girl_Anne_Hathaway.npz +0 -0
  33. data/voice_presets/npz/mark_professor.npz +0 -0
  34. data/voice_presets/npz/news_female_speaker.npz +0 -0
  35. data/voice_presets/npz/news_female_speaker_outside.npz +0 -0
  36. data/voice_presets/npz/news_male_speaker.npz +0 -0
  37. data/voice_presets/npz/trump.npz +0 -0
  38. parse_voice.py +31 -0
  39. pipeline.py +235 -0
  40. prompts/audio_script_to_character_voice_map.prompt +11 -0
  41. prompts/audio_script_to_json.prompt +74 -0
  42. prompts/script_to_json.prompt +58 -0
  43. prompts/text_to_audio_script.prompt +34 -0
  44. prompts/text_to_json.prompt +31 -0
  45. scripts/download_models.py +31 -0
  46. scripts/kill_services.py +28 -0
  47. scripts/restart_services.sh +2 -0
  48. scripts/start_services.py +41 -0
  49. scripts/start_ui.sh +1 -0
  50. ui_client.py +273 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
+ output/*
+ __pycache__/*
+ Bark/__pycache__/*
+ *.wav
+ Bark/request.py
+ VoiceFixer/request.py
+ service_logs/*
+ convert_script_to_audio_gen_code.py
+ /cache/
+ VoiceParser/hubert/*
+ VoiceParser/__pycache__
+ config.yaml
+ /services_logs/
APIs.py ADDED
@@ -0,0 +1,215 @@
+ import os
+ import numpy as np
+ import requests
+ import yaml
+ import pyloudnorm as pyln
+ from scipy.io.wavfile import write
+ import torchaudio
+ from retrying import retry
+
+
+ os.environ['OPENBLAS_NUM_THREADS'] = '1'
+
+ SAMPLE_RATE = 32000
+
+
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+     tts_port = config['Text-to-Speech']['service-port']
+     ttm_port = config['Text-to-Music']['service-port']
+     tta_port = config['Text-to-Audio']['service-port']
+     sr_port = config['Speech-Restoration']['service-port']
+     vp_port = config['Voice-Parser']['service-port']
+     enable_sr = config['Speech-Restoration']['Enable']
+
+
+ def IDLE(length=1.0, out_wav='out.wav', sr=SAMPLE_RATE):
+     idle = np.zeros(int(length * sr))
+     WRITE_AUDIO(idle, name=out_wav, sr=sr)
+
+
+ def LOUDNESS_NORM(audio, sr=32000, volumn=-25):
+     # peak normalize audio to -10 dB
+     peak_normalized_audio = pyln.normalize.peak(audio, -10.0)
+     # measure the loudness first
+     meter = pyln.Meter(sr)  # create BS.1770 meter
+     loudness = meter.integrated_loudness(peak_normalized_audio)
+     # loudness normalize audio to the target level (default -25 dB LUFS)
+     normalized_audio = pyln.normalize.loudness(peak_normalized_audio, loudness, volumn)
+     return normalized_audio
+
+
+ def WRITE_AUDIO(wav, name=None, sr=SAMPLE_RATE):
+     """
+     function: write an audio numpy array to a .wav file
+     @params:
+         wav: np.array [samples]
+     """
+     if name is None:
+         name = 'output.wav'
+
+     if len(wav.shape) > 1:
+         wav = wav[0]
+
+     # declipping
+
+     max_value = np.max(np.abs(wav))
+     if max_value > 1:
+         wav *= 0.9 / max_value
+
+     # print(f'WRITE_AUDIO to {name}')
+     write(name, sr, np.round(wav * 32767).astype(np.int16))
+
+
+ def READ_AUDIO_NUMPY(wav, sr=SAMPLE_RATE):
+     """
+     function: read an audio file into a numpy array, resampling to sr if needed
+     return: np.array [samples]
+     """
+     waveform, sample_rate = torchaudio.load(wav)
+
+     if sample_rate != sr:
+         waveform = torchaudio.functional.resample(waveform, orig_freq=sample_rate, new_freq=sr)
+
+     wav_numpy = waveform[0].numpy()
+
+     return wav_numpy
+
+
+ def MIX(wavs=[['1.wav', 0.], ['2.wav', 10.]], out_wav='out.wav', sr=SAMPLE_RATE):
+     """
+     wavs: [[wav_name, absolute_offset_in_seconds], ...]
+     """
+
+     # last_name, last_offset = wavs[-1]
+     # last_len = len(READ_AUDIO_NUMPY(last_name))
+     # max_length = int(last_offset * sr + last_len)
+
+     max_length = max([int(wav[1] * sr + len(READ_AUDIO_NUMPY(wav[0]))) for wav in wavs])
+     template_wav = np.zeros(max_length)
+
+     for wav in wavs:
+         cur_name, cur_offset = wav
+         cur_wav = READ_AUDIO_NUMPY(cur_name)
+         cur_len = len(cur_wav)
+         cur_offset = int(cur_offset * sr)
+
+         # mix
+         template_wav[cur_offset:cur_offset + cur_len] += cur_wav
+
+     WRITE_AUDIO(template_wav, name=out_wav)
+
+
+ def CAT(wavs, out_wav='out.wav'):
+     """
+     wavs: list of wav files ['1.wav', '2.wav', ...]
+     """
+     wav_num = len(wavs)
+
+     segment0 = READ_AUDIO_NUMPY(wavs[0])
+
+     cat_wav = segment0
+
+     if wav_num > 1:
+         for i in range(1, wav_num):
+             next_wav = READ_AUDIO_NUMPY(wavs[i])
+             cat_wav = np.concatenate((cat_wav, next_wav), axis=-1)
+
+     WRITE_AUDIO(cat_wav, name=out_wav)
+
+
+ def COMPUTE_LEN(wav):
+     wav = READ_AUDIO_NUMPY(wav)
+     return len(wav) / SAMPLE_RATE
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTM(text, length=10, volume=-28, out_wav='out.wav'):
+     url = f'http://127.0.0.1:{ttm_port}/generate_music'
+     data = {
+         'text': f'{text}',
+         'length': f'{length}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTA(text, length=5, volume=-35, out_wav='out.wav'):
+     url = f'http://127.0.0.1:{tta_port}/generate_audio'
+     data = {
+         'text': f'{text}',
+         'length': f'{length}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def TTS(text, speaker='news_anchor', volume=-20, out_wav='out.wav', enhanced=enable_sr, speaker_id='', speaker_npz=''):
+     url = f'http://127.0.0.1:{tts_port}/generate_speech'
+     data = {
+         'text': f'{text}',
+         'speaker_id': f'{speaker_id}',
+         'speaker_npz': f'{speaker_npz}',
+         'volume': f'{volume}',
+         'output_wav': f'{out_wav}',
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+     if enhanced:
+         SR(processfile=out_wav)
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def SR(processfile):
+     url = f'http://127.0.0.1:{sr_port}/fix_audio'
+     data = {'processfile': f'{processfile}'}
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
+
+ @retry(stop_max_attempt_number=5, wait_fixed=2000)
+ def VP(wav_path, out_dir):
+     url = f'http://127.0.0.1:{vp_port}/parse_voice'
+     data = {
+         'wav_path': f'{wav_path}',
+         'out_dir': f'{out_dir}'
+     }
+
+     response = requests.post(url, json=data)
+
+     if response.status_code == 200:
+         print('Success:', response.json()['message'])
+     else:
+         print('Error:', response.json()['API error'])
+         raise RuntimeError(response.json()['API error'])
+
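The wrappers above are designed to compose: each generation call writes a wav file, and `MIX`/`CAT` combine files by offset or concatenation. A minimal usage sketch, assuming the services are already running and `config.yaml` is present; the file names here are hypothetical:

```python
# Minimal sketch of composing the APIs above into one audio scene.
from APIs import TTM, TTS, MIX, COMPUTE_LEN

# Generate a music bed and a narration (each call blocks until its service responds).
TTM('calm newsroom background music', length=10, volume=-28, out_wav='music.wav')
TTS('Welcome to the evening news.', volume=-20, out_wav='speech.wav',
    speaker_id='news_male_speaker',
    speaker_npz='data/voice_presets/npz/news_male_speaker.npz')

# Overlay the narration on the music, starting 1.0 s in, then report the length.
MIX(wavs=[['music.wav', 0.0], ['speech.wav', 1.0]], out_wav='scene.wav')
print(COMPUTE_LEN('scene.wav'), 'seconds')
```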
AudioCraft/app.py ADDED
@@ -0,0 +1,110 @@
+ import sys
+ sys.path.append('../AudioJourney')
+ import os
+ import yaml
+ import logging
+ import torchaudio
+ from APIs import WRITE_AUDIO, LOUDNESS_NORM
+ from utils import fade
+ from flask import Flask, request, jsonify
+
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ # Configure the logging format and level
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ # Create a FileHandler for the log file
+ os.makedirs('services_logs', exist_ok=True)
+ log_filename = 'services_logs/Text-to-Audio-Music.log'
+ file_handler = logging.FileHandler(log_filename, mode='w')
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+
+ # Add the FileHandler to the root logger
+ logging.getLogger('').addHandler(file_handler)
+
+
+ # Initialize the models here
+ from audiocraft.models import AudioGen, MusicGen
+ tta_model = AudioGen.get_pretrained('facebook/audiogen-medium')
+ logging.info('AudioGen is loaded ...')
+
+ model_size = config['Text-to-Music']['model_size']
+ ttm_model = MusicGen.get_pretrained(f'facebook/musicgen-{model_size}')
+ logging.info(f'MusicGen ({model_size}) is loaded ...')
+
+ app = Flask(__name__)
+
+ @app.route('/generate_audio', methods=['POST'])
+ def generate_audio():
+     # Receive the text from the POST request
+     data = request.json
+     text = data['text']
+     length = float(data.get('length', 5.0))
+     volume = float(data.get('volume', -35))
+     output_wav = data.get('output_wav', 'out.wav')
+
+     logging.info(f'TTA (AudioGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
+
+     try:
+         tta_model.set_generation_params(duration=length)
+         wav = tta_model.generate([text])
+         wav = torchaudio.functional.resample(wav, orig_freq=16000, new_freq=32000)
+
+         wav = wav.squeeze().cpu().detach().numpy()
+         wav = fade(LOUDNESS_NORM(wav, volumn=volume))
+         WRITE_AUDIO(wav, name=output_wav)
+
+         # Return success message and the filename of the generated audio
+         return jsonify({'message': f'Text-to-Audio generated successfully | {text}', 'file': output_wav})
+
+     except Exception as e:
+         return jsonify({'API error': str(e)}), 500
+
+
+ @app.route('/generate_music', methods=['POST'])
+ def generate_music():
+     # Receive the text from the POST request
+     data = request.json
+     text = data['text']
+     length = float(data.get('length', 5.0))
+     volume = float(data.get('volume', -35))
+     output_wav = data.get('output_wav', 'out.wav')
+
+     logging.info(f'TTM (MusicGen): Prompt: {text}, length: {length} seconds, volume: {volume} dB')
+
+     try:
+         ttm_model.set_generation_params(duration=length)
+         wav = ttm_model.generate([text])
+         wav = wav[0][0].cpu().detach().numpy()
+         wav = fade(LOUDNESS_NORM(wav, volumn=volume))
+         WRITE_AUDIO(wav, name=output_wav)
+
+         # Return success message and the filename of the generated audio
+         return jsonify({'message': f'Text-to-Music generated successfully | {text}', 'file': output_wav})
+
+     except Exception as e:
+         # Return error message if something goes wrong
+         return jsonify({'API error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     import yaml
+     with open('config.yaml', 'r') as file:
+         config = yaml.safe_load(file)
+
+     tta_service_port = config['Text-to-Audio']['service-port']
+     ttm_service_port = config['Text-to-Music']['service-port']
+
+     if tta_service_port != ttm_service_port:
+         msg = 'Ports of TTA and TTM should be the same if you are using AudioCraft ...'
+         logging.info(msg)
+         raise ValueError(msg)
+
+     app.run(debug=False, port=tta_service_port)
+
Bark/__init__.py ADDED
File without changes
Bark/app.py ADDED
@@ -0,0 +1,109 @@
+ import os
+ import sys
+ sys.path.append('../AudioJourney')
+ import logging
+ import yaml
+ import numpy as np
+ import torch
+ import torchaudio
+ from torchaudio.transforms import SpeedPerturbation
+ import nltk
+ from APIs import WRITE_AUDIO, LOUDNESS_NORM
+ from flask import Flask, request, jsonify
+ from transformers import BarkModel, AutoProcessor
+
+
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ # Configure the logging format and level
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ # Create a FileHandler for the log file
+ os.makedirs('services_logs', exist_ok=True)
+ log_filename = 'services_logs/Text-to-Speech.log'
+ file_handler = logging.FileHandler(log_filename, mode='w')
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+
+ # Add the FileHandler to the root logger
+ logging.getLogger('').addHandler(file_handler)
+
+ # Initialize the model here
+ SPEED = float(config['Text-to-Speech']['speed'])
+ speed_perturb = SpeedPerturbation(32000, [SPEED])
+
+ logging.info('Loading Bark model ...')
+ # TODO: fp16?
+ model = BarkModel.from_pretrained("suno/bark")
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ model = model.to(device)
+ model = model.to_bettertransformer()  # Flash attention
+ SAMPLE_RATE = model.generation_config.sample_rate
+ SEMANTIC_TEMPERATURE = 0.9
+ COARSE_TEMPERATURE = 0.5
+ FINE_TEMPERATURE = 0.5
+
+ processor = AutoProcessor.from_pretrained("suno/bark")
+
+ app = Flask(__name__)
+
+ @app.route('/generate_speech', methods=['POST'])
+ def generate_speech():
+     # Receive the text from the POST request
+     data = request.json
+     text = data['text']
+     speaker_id = data['speaker_id']
+     speaker_npz = data['speaker_npz']
+     volume = float(data.get('volume', -35))
+     output_wav = data.get('output_wav', 'out.wav')
+
+     logging.info(f'TTS (Bark): Speaker: {speaker_id}, Volume: {volume} dB, Prompt: {text}')
+
+     try:
+         # Generate audio sentence by sentence using the global Bark model
+         text = text.replace('\n', ' ').strip()
+         sentences = nltk.sent_tokenize(text)
+         silence = torch.zeros(int(0.1 * SAMPLE_RATE), device=device).unsqueeze(0)  # 0.1 second of silence
+
+         pieces = []
+         for sentence in sentences:
+             inputs = processor(sentence, voice_preset=speaker_npz).to(device)
+             # NOTE: you must run the line below, otherwise you will see the runtime error
+             # RuntimeError: view size is not compatible with input tensor's size and stride (at least one dimension spans across two contiguous subspaces). Use .reshape(...) instead.
+             inputs['history_prompt']['coarse_prompt'] = inputs['history_prompt']['coarse_prompt'].transpose(0, 1).contiguous().transpose(0, 1)
+
+             with torch.inference_mode():
+                 # TODO: min_eos_p?
+                 output = model.generate(
+                     **inputs,
+                     do_sample=True,
+                     semantic_temperature=SEMANTIC_TEMPERATURE,
+                     coarse_temperature=COARSE_TEMPERATURE,
+                     fine_temperature=FINE_TEMPERATURE
+                 )
+
+             pieces += [output, silence]
+
+         result_audio = torch.cat(pieces, dim=1)
+         wav_tensor = result_audio.to(dtype=torch.float32).cpu()
+         wav = torchaudio.functional.resample(wav_tensor, orig_freq=SAMPLE_RATE, new_freq=32000)
+         wav = speed_perturb(wav.float())[0].squeeze(0)
+         wav = wav.numpy()
+         wav = LOUDNESS_NORM(wav, volumn=volume)
+         WRITE_AUDIO(wav, name=output_wav)
+
+         # Return success message and the filename of the generated audio
+         return jsonify({'message': f'Text-to-Speech generated successfully | {speaker_id}: {text}', 'file': output_wav})
+
+     except Exception as e:
+         # Return error message if something goes wrong
+         return jsonify({'API error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     service_port = config['Text-to-Speech']['service-port']
+     app.run(debug=False, port=service_port)
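For reference, the request contract of this service can also be exercised directly, without the `TTS` wrapper in `APIs.py`. A minimal sketch, assuming the service is running; the port here is hypothetical (the real one comes from `config.yaml`):

```python
import requests

# Hypothetical port; in practice read config['Text-to-Speech']['service-port'].
url = 'http://127.0.0.1:8101/generate_speech'
payload = {
    'text': 'Hello from Bark.',
    'speaker_id': 'news_male_speaker',
    'speaker_npz': 'data/voice_presets/npz/news_male_speaker.npz',
    'volume': -20,
    'output_wav': 'hello.wav',
}
response = requests.post(url, json=payload)
print(response.status_code, response.json())
```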
Envs/AudioCraft.yml ADDED
@@ -0,0 +1,237 @@
+ name: AudioCraft
+ channels:
+   - nvidia/label/cuda-11.8.0
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.05.30=h06a4308_0
+   - cuda-cccl=11.8.89=0
+   - cuda-command-line-tools=11.8.0=0
+   - cuda-compiler=11.8.0=0
+   - cuda-cudart=11.8.89=0
+   - cuda-cudart-dev=11.8.89=0
+   - cuda-cuobjdump=11.8.86=0
+   - cuda-cupti=11.8.87=0
+   - cuda-cuxxfilt=11.8.86=0
+   - cuda-documentation=11.8.86=0
+   - cuda-driver-dev=11.8.89=0
+   - cuda-gdb=11.8.86=0
+   - cuda-libraries=11.8.0=0
+   - cuda-libraries-dev=11.8.0=0
+   - cuda-memcheck=11.8.86=0
+   - cuda-nsight=11.8.86=0
+   - cuda-nsight-compute=11.8.0=0
+   - cuda-nvcc=11.8.89=0
+   - cuda-nvdisasm=11.8.86=0
+   - cuda-nvml-dev=11.8.86=0
+   - cuda-nvprof=11.8.87=0
+   - cuda-nvprune=11.8.86=0
+   - cuda-nvrtc=11.8.89=0
+   - cuda-nvrtc-dev=11.8.89=0
+   - cuda-nvtx=11.8.86=0
+   - cuda-nvvp=11.8.87=0
+   - cuda-profiler-api=11.8.86=0
+   - cuda-sanitizer-api=11.8.86=0
+   - cuda-toolkit=11.8.0=0
+   - cuda-tools=11.8.0=0
+   - cuda-visual-tools=11.8.0=0
+   - gds-tools=1.4.0.31=0
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libcublas=11.11.3.6=0
+   - libcublas-dev=11.11.3.6=0
+   - libcufft=10.9.0.58=0
+   - libcufft-dev=10.9.0.58=0
+   - libcufile=1.4.0.31=0
+   - libcufile-dev=1.4.0.31=0
+   - libcurand=10.3.0.86=0
+   - libcurand-dev=10.3.0.86=0
+   - libcusolver=11.4.1.48=0
+   - libcusolver-dev=11.4.1.48=0
+   - libcusparse=11.7.5.86=0
+   - libcusparse-dev=11.7.5.86=0
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libgomp=13.1.0=he5830b7_0
+   - libnpp=11.8.0.86=0
+   - libnpp-dev=11.8.0.86=0
+   - libnsl=2.0.0=h7f98852_0
+   - libnvjpeg=11.9.0.86=0
+   - libnvjpeg-dev=11.9.0.86=0
+   - libsqlite=3.42.0=h2797004_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - nsight-compute=2022.3.0.22=0
+   - openssl=3.1.1=hd590300_1
+   - pip=23.1.2=pyhd8ed1ab_0
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_0
+   - xz=5.2.6=h166bdaf_0
+   - pip:
+     - aiofiles==23.1.0
+     - aiohttp==3.8.4
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - antlr4-python3-runtime==4.9.3
+     - anyio==3.7.1
+     - appdirs==1.4.4
+     - async-timeout==4.0.2
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - av==10.0.0
+     - blinker==1.6.2
+     - blis==0.7.9
+     - catalogue==2.0.8
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cloudpickle==2.2.1
+     - cmake==3.26.4
+     - colorlog==6.7.0
+     - confection==0.1.0
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - cymem==2.0.7
+     - cython==0.29.36
+     - decorator==5.1.1
+     - demucs==4.0.0
+     - diffq==0.2.4
+     - docopt==0.6.2
+     - dora-search==0.1.12
+     - einops==0.6.1
+     - encodec==0.1.1
+     - exceptiongroup==1.1.2
+     - fastapi==0.100.0
+     - ffmpy==0.3.0
+     - filelock==3.12.2
+     - flashy==0.0.2
+     - flask==2.3.2
+     - fonttools==4.41.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - future==0.18.3
+     - gradio==3.36.1
+     - gradio-client==0.2.9
+     - h11==0.14.0
+     - httpcore==0.17.3
+     - httpx==0.24.1
+     - huggingface-hub==0.16.4
+     - hydra-colorlog==1.2.0
+     - hydra-core==1.3.2
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - itsdangerous==2.1.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - jsonschema==4.18.3
+     - jsonschema-specifications==2023.6.1
+     - julius==0.2.7
+     - kiwisolver==1.4.4
+     - lameenc==1.5.1
+     - langcodes==3.3.0
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - lightning-utilities==0.9.0
+     - linkify-it-py==2.0.2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - mdit-py-plugins==0.3.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - murmurhash==1.0.9
+     - mypy-extensions==1.0.0
+     - networkx==3.1
+     - num2words==0.5.12
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - omegaconf==2.3.0
+     - openunmix==1.2.1
+     - orjson==3.9.2
+     - packaging==23.1
+     - pandas==2.0.3
+     - pathy==0.10.2
+     - pillow==10.0.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - preshed==3.0.8
+     - pycparser==2.21
+     - pydantic==1.10.11
+     - pydub==0.25.1
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pyparsing==3.0.9
+     - pyre-extensions==0.0.29
+     - python-dateutil==2.8.2
+     - python-multipart==0.0.6
+     - pytz==2023.3
+     - pyyaml==6.0
+     - referencing==0.29.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - retrying==1.3.4
+     - rpds-py==0.8.10
+     - safetensors==0.3.1
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - semantic-version==2.10.0
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - smart-open==6.3.0
+     - sniffio==1.3.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - spacy==3.5.2
+     - spacy-legacy==3.0.12
+     - spacy-loggers==1.0.4
+     - srsly==2.4.6
+     - starlette==0.27.0
+     - submitit==1.4.5
+     - sympy==1.12
+     - thinc==8.1.10
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - torchmetrics==1.0.1
+     - tqdm==4.65.0
+     - transformers==4.31.0
+     - treetable==0.2.5
+     - triton==2.0.0
+     - typer==0.7.0
+     - typing-extensions==4.7.1
+     - typing-inspect==0.9.0
+     - tzdata==2023.3
+     - uc-micro-py==1.0.2
+     - urllib3==2.0.3
+     - uvicorn==0.22.0
+     - wasabi==1.1.2
+     - websockets==11.0.3
+     - werkzeug==2.3.6
+     - xformers==0.0.20
+     - yarl==1.9.2
+     - zipp==3.16.2
Envs/Bark.yml ADDED
@@ -0,0 +1,180 @@
+ name: Bark
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.5.7=hbcca054_0
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libgomp=13.1.0=he5830b7_0
+   - libnsl=2.0.0=h7f98852_0
+   - libsqlite=3.42.0=h2797004_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - openssl=3.1.1=hd590300_1
+   - pip=23.1.2=pyhd8ed1ab_0
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_0
+   - xz=5.2.6=h166bdaf_0
+   - pip:
+     - aiohttp==3.8.5
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - appdirs==1.4.4
+     - asttokens==2.2.1
+     - async-timeout==4.0.3
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - backcall==0.2.0
+     - backports-zoneinfo==0.2.1
+     - blinker==1.6.2
+     - boto3==1.28.3
+     - botocore==1.31.3
+     - cachetools==5.3.1
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cmake==3.26.4
+     - coloredlogs==15.0.1
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - datasets==2.14.4
+     - decorator==5.1.1
+     - dill==0.3.7
+     - einops==0.6.1
+     - encodec==0.1.1
+     - executing==1.2.0
+     - filelock==3.12.2
+     - fire==0.5.0
+     - flask==2.3.2
+     - fonttools==4.41.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - funcy==2.0
+     - future==0.18.3
+     - gitdb==4.0.10
+     - gitpython==3.1.32
+     - huggingface-hub==0.16.4
+     - humanfriendly==10.0
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - ipdb==0.13.13
+     - ipython==8.12.2
+     - itsdangerous==2.1.2
+     - jedi==0.19.0
+     - jinja2==3.1.2
+     - jmespath==1.0.1
+     - joblib==1.3.1
+     - jsonschema==4.18.3
+     - jsonschema-specifications==2023.6.1
+     - kiwisolver==1.4.4
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - markdown-it-py==3.0.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - matplotlib-inline==0.1.6
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - multiprocess==0.70.15
+     - networkx==3.1
+     - nltk==3.8.1
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - optimum==1.11.1
+     - packaging==23.1
+     - pandas==2.0.3
+     - parso==0.8.3
+     - pexpect==4.8.0
+     - pickleshare==0.7.5
+     - pillow==9.5.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - progressbar==2.5
+     - prompt-toolkit==3.0.39
+     - protobuf==4.23.4
+     - ptyprocess==0.7.0
+     - pure-eval==0.2.2
+     - pyarrow==12.0.1
+     - pycparser==2.21
+     - pydeck==0.8.1b0
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pympler==1.0.1
+     - pyparsing==3.0.9
+     - python-dateutil==2.8.2
+     - pytz==2023.3
+     - pytz-deprecation-shim==0.1.0.post0
+     - pyyaml==6.0
+     - referencing==0.29.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - resampy==0.4.2
+     - retrying==1.3.4
+     - rich==13.4.2
+     - rpds-py==0.8.10
+     - s3transfer==0.6.1
+     - safetensors==0.3.1
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - smmap==5.0.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - stack-data==0.6.2
+     - streamlit==1.24.1
+     - suno-bark==0.1.5
+     - sympy==1.12
+     - tenacity==8.2.2
+     - termcolor==2.3.0
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - toml==0.10.2
+     - tomli==2.0.1
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - torchlibrosa==0.0.7
+     - tornado==6.3.2
+     - tqdm==4.65.0
+     - traitlets==5.9.0
+     - transformers==4.31.0
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - tzlocal==4.3.1
+     - urllib3==1.26.16
+     - validators==0.20.0
+     - watchdog==3.0.0
+     - wcwidth==0.2.6
+     - werkzeug==2.3.6
+     - xxhash==3.3.0
+     - yarl==1.9.2
+     - zipp==3.16.1
+ prefix: /home/zzk/Workspace/miniconda3/envs/Bark
Envs/VoiceFixer.yml ADDED
@@ -0,0 +1,123 @@
+ name: VoiceFixer
+ channels:
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=main
+   - _openmp_mutex=5.1=1_gnu
+   - ca-certificates=2023.05.30=h06a4308_0
+   - ld_impl_linux-64=2.38=h1181459_1
+   - libffi=3.4.4=h6a678d5_0
+   - libgcc-ng=11.2.0=h1234567_1
+   - libgomp=11.2.0=h1234567_1
+   - libstdcxx-ng=11.2.0=h1234567_1
+   - ncurses=6.4=h6a678d5_0
+   - openssl=3.0.9=h7f8727e_0
+   - pip=23.2.1=py38h06a4308_0
+   - python=3.8.17=h955ad1f_0
+   - readline=8.2=h5eee18b_0
+   - setuptools=68.0.0=py38h06a4308_0
+   - sqlite=3.41.2=h5eee18b_0
+   - tk=8.6.12=h1ccaba5_0
+   - wheel=0.38.4=py38h06a4308_0
+   - xz=5.4.2=h5eee18b_0
+   - zlib=1.2.13=h5eee18b_0
+   - pip:
+     - altair==5.0.1
+     - attrs==23.1.0
+     - audioread==3.0.0
+     - backports-zoneinfo==0.2.1
+     - blinker==1.6.2
+     - cachetools==5.3.1
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.5
+     - cmake==3.27.0
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - decorator==5.1.1
+     - filelock==3.12.2
+     - flask==2.3.2
+     - fonttools==4.38.0
+     - gitdb==4.0.10
+     - gitpython==3.1.32
+     - idna==3.4
+     - importlib-metadata==6.7.0
+     - importlib-resources==5.12.0
+     - itsdangerous==2.1.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - jsonschema==4.17.3
+     - jsonschema-specifications==2023.7.1
+     - kiwisolver==1.4.4
+     - librosa==0.8.1
+     - lit==16.0.6
+     - llvmlite==0.39.1
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.5.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - networkx==3.1
+     - numba==0.56.4
+     - numpy==1.21.6
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - packaging==23.1
+     - pandas==1.3.5
+     - pillow==9.5.0
+     - pkgutil-resolve-name==1.3.10
+     - platformdirs==3.9.1
+     - pooch==1.7.0
+     - progressbar==2.5
+     - protobuf==4.23.4
+     - pyarrow==12.0.1
+     - pycparser==2.21
+     - pydeck==0.8.1b0
+     - pygments==2.15.1
+     - pympler==1.0.1
+     - pyparsing==3.1.0
+     - pyrsistent==0.19.3
+     - python-dateutil==2.8.2
+     - pytz==2023.3
+     - pytz-deprecation-shim==0.1.0.post0
+     - pyyaml==6.0.1
+     - referencing==0.30.0
+     - requests==2.31.0
+     - resampy==0.4.2
+     - retrying==1.3.4
+     - rich==13.4.2
+     - rpds-py==0.9.2
+     - scikit-learn==1.0.2
+     - scipy==1.7.3
+     - six==1.16.0
+     - smmap==5.0.0
+     - soundfile==0.12.1
+     - streamlit==1.23.1
+     - sympy==1.12
+     - tenacity==8.2.2
+     - threadpoolctl==3.1.0
+     - toml==0.10.2
+     - toolz==0.12.0
+     - torch==1.13.1
+     - torchlibrosa==0.0.7
+     - tornado==6.2
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - tzlocal==4.3.1
+     - urllib3==2.0.3
+     - validators==0.20.0
+     - voicefixer==0.1.2
+     - watchdog==3.0.0
+     - werkzeug==2.3.6
+     - zipp==3.15.0
Envs/WavJourney.yml ADDED
@@ -0,0 +1,248 @@
+ name: WavJourney
+ channels:
+   - conda-forge
+   - defaults
+ dependencies:
+   - _libgcc_mutex=0.1=conda_forge
+   - _openmp_mutex=4.5=2_gnu
+   - aom=3.5.0=h27087fc_0
+   - bzip2=1.0.8=h7f98852_4
+   - ca-certificates=2023.7.22=hbcca054_0
+   - cairo=1.16.0=hbbf8b49_1016
+   - dav1d=1.2.1=hd590300_0
+   - expat=2.5.0=hcb278e6_1
+   - ffmpeg=6.0.0=gpl_hdbbbd96_103
+   - font-ttf-dejavu-sans-mono=2.37=hab24e00_0
+   - font-ttf-inconsolata=3.000=h77eed37_0
+   - font-ttf-source-code-pro=2.038=h77eed37_0
+   - font-ttf-ubuntu=0.83=hab24e00_0
+   - fontconfig=2.14.2=h14ed4e7_0
+   - fonts-conda-ecosystem=1=0
+   - fonts-conda-forge=1=0
+   - freetype=2.12.1=hca18f0e_1
+   - fribidi=1.0.10=h36c2ea0_0
+   - gettext=0.21.1=h27087fc_0
+   - gmp=6.2.1=h58526e2_0
+   - gnutls=3.7.8=hf3e180e_0
+   - graphite2=1.3.13=h58526e2_1001
+   - harfbuzz=7.3.0=hdb3a94d_0
+   - icu=72.1=hcb278e6_0
+   - lame=3.100=h166bdaf_1003
+   - ld_impl_linux-64=2.40=h41732ed_0
+   - libass=0.17.1=hc9aadba_0
+   - libdrm=2.4.114=h166bdaf_0
+   - libexpat=2.5.0=hcb278e6_1
+   - libffi=3.4.2=h7f98852_5
+   - libgcc-ng=13.1.0=he5830b7_0
+   - libglib=2.76.4=hebfc3b9_0
+   - libgomp=13.1.0=he5830b7_0
+   - libiconv=1.17=h166bdaf_0
+   - libidn2=2.3.4=h166bdaf_0
+   - libnsl=2.0.0=h7f98852_0
+   - libopus=1.3.1=h7f98852_1
+   - libpciaccess=0.17=h166bdaf_0
+   - libpng=1.6.39=h753d276_0
+   - libsqlite=3.42.0=h2797004_0
+   - libstdcxx-ng=13.1.0=hfd8a6a1_0
+   - libtasn1=4.19.0=h166bdaf_0
+   - libunistring=0.9.10=h7f98852_0
+   - libuuid=2.38.1=h0b41bf4_0
+   - libva=2.19.0=hd590300_0
+   - libvpx=1.13.0=hcb278e6_0
+   - libxcb=1.15=h0b41bf4_0
+   - libxml2=2.11.5=h0d562d8_0
+   - libzlib=1.2.13=hd590300_5
+   - ncurses=6.4=hcb278e6_0
+   - nettle=3.8.1=hc379101_1
+   - openh264=2.3.1=hcb278e6_2
+   - openssl=3.1.2=hd590300_0
+   - p11-kit=0.24.1=hc5aa10d_0
+   - pcre2=10.40=hc3806b6_0
+   - pip=23.2=pyhd8ed1ab_0
+   - pixman=0.40.0=h36c2ea0_0
+   - pthread-stubs=0.4=h36c2ea0_1001
+   - python=3.8.17=he550d4f_0_cpython
+   - readline=8.2=h8228510_1
+   - setuptools=68.0.0=pyhd8ed1ab_0
+   - svt-av1=1.6.0=h59595ed_0
+   - tk=8.6.12=h27826a3_0
+   - wheel=0.40.0=pyhd8ed1ab_1
+   - x264=1!164.3095=h166bdaf_2
+   - x265=3.5=h924138e_3
+   - xorg-fixesproto=5.0=h7f98852_1002
+   - xorg-kbproto=1.0.7=h7f98852_1002
+   - xorg-libice=1.1.1=hd590300_0
+   - xorg-libsm=1.2.4=h7391055_0
+   - xorg-libx11=1.8.6=h8ee46fc_0
+   - xorg-libxau=1.0.11=hd590300_0
+   - xorg-libxdmcp=1.1.3=h7f98852_0
+   - xorg-libxext=1.3.4=h0b41bf4_2
+   - xorg-libxfixes=5.0.3=h7f98852_1004
+   - xorg-libxrender=0.9.11=hd590300_0
+   - xorg-renderproto=0.11.1=h7f98852_1002
+   - xorg-xextproto=7.3.0=h0b41bf4_1003
+   - xorg-xproto=7.0.31=h7f98852_1007
+   - xz=5.2.6=h166bdaf_0
+   - zlib=1.2.13=hd590300_5
+   - pip:
+     - accelerate==0.21.0
+     - aiofiles==23.1.0
+     - aiohttp==3.8.5
+     - aiosignal==1.3.1
+     - altair==5.0.1
+     - annotated-types==0.5.0
+     - antlr4-python3-runtime==4.8
+     - anyio==3.7.1
+     - appdirs==1.4.4
+     - asttokens==2.2.1
+     - async-timeout==4.0.2
+     - attrs==23.1.0
+     - audiolm-pytorch==1.1.4
+     - audioread==3.0.0
+     - backcall==0.2.0
+     - beartype==0.15.0
+     - bitarray==2.8.1
+     - blinker==1.6.2
+     - certifi==2023.5.7
+     - cffi==1.15.1
+     - charset-normalizer==3.2.0
+     - click==8.1.6
+     - cmake==3.26.4
+     - colorama==0.4.6
+     - contourpy==1.1.0
+     - cycler==0.11.0
+     - cython==3.0.0
+     - decorator==5.1.1
+     - einops==0.6.1
+     - ema-pytorch==0.2.3
+     - encodec==0.1.1
+     - exceptiongroup==1.1.2
+     - executing==1.2.0
+     - fairseq==0.12.2
+     - fastapi==0.100.1
+     - ffmpy==0.3.1
+     - filelock==3.12.2
+     - flask==2.3.2
+     - fonttools==4.42.0
+     - frozenlist==1.4.0
+     - fsspec==2023.6.0
+     - future==0.18.3
+     - gradio==3.39.0
+     - gradio-client==0.3.0
+     - h11==0.14.0
+     - httpcore==0.17.3
+     - httpx==0.24.1
+     - huggingface-hub==0.16.4
+     - hydra-core==1.0.7
+     - idna==3.4
+     - importlib-metadata==6.8.0
+     - importlib-resources==6.0.0
+     - ipdb==0.13.13
+     - ipython==8.12.2
+     - itsdangerous==2.1.2
+     - jedi==0.18.2
+     - jinja2==3.1.2
+     - joblib==1.3.1
+     - json5==0.9.14
+     - jsonschema==4.18.6
+     - jsonschema-specifications==2023.7.1
+     - kiwisolver==1.4.4
+     - lazy-loader==0.3
+     - librosa==0.10.0.post2
+     - linkify-it-py==2.0.2
+     - lion-pytorch==0.1.2
+     - lit==16.0.6
+     - llvmlite==0.40.1
+     - local-attention==1.8.6
+     - lxml==4.9.3
+     - markdown-it-py==2.2.0
+     - markupsafe==2.1.3
+     - matplotlib==3.7.2
+     - matplotlib-inline==0.1.6
+     - mdit-py-plugins==0.3.3
+     - mdurl==0.1.2
+     - mpmath==1.3.0
+     - msgpack==1.0.5
+     - multidict==6.0.4
+     - networkx==3.1
+     - nltk==3.8.1
+     - numba==0.57.1
+     - numpy==1.24.4
+     - nvidia-cublas-cu11==11.10.3.66
+     - nvidia-cuda-cupti-cu11==11.7.101
+     - nvidia-cuda-nvrtc-cu11==11.7.99
+     - nvidia-cuda-runtime-cu11==11.7.99
+     - nvidia-cudnn-cu11==8.5.0.96
+     - nvidia-cufft-cu11==10.9.0.58
+     - nvidia-curand-cu11==10.2.10.91
+     - nvidia-cusolver-cu11==11.4.0.1
+     - nvidia-cusparse-cu11==11.7.4.91
+     - nvidia-nccl-cu11==2.14.3
+     - nvidia-nvtx-cu11==11.7.91
+     - omegaconf==2.0.6
+     - openai==0.27.8
+     - orjson==3.9.2
+     - packaging==23.1
+     - pandas==2.0.3
+     - parso==0.8.3
+     - pexpect==4.8.0
+     - pickleshare==0.7.5
+     - pillow==10.0.0
+     - pkgutil-resolve-name==1.3.10
+     - pooch==1.6.0
+     - portalocker==2.7.0
+     - prompt-toolkit==3.0.39
+     - psutil==5.9.5
+     - ptyprocess==0.7.0
+     - pure-eval==0.2.2
+     - pycparser==2.21
+     - pydantic==2.1.1
+     - pydantic-core==2.4.0
+     - pydub==0.25.1
+     - pygments==2.15.1
+     - pyloudnorm==0.1.1
+     - pyparsing==3.0.9
+     - python-dateutil==2.8.2
+     - python-multipart==0.0.6
+     - pytz==2023.3
+     - pyyaml==6.0.1
+     - referencing==0.30.1
+     - regex==2023.6.3
+     - requests==2.31.0
+     - retrying==1.3.4
+     - rpds-py==0.9.2
+     - sacrebleu==2.3.1
+     - safetensors==0.3.2
+     - scikit-learn==1.3.0
+     - scipy==1.10.1
+     - semantic-version==2.10.0
+     - sentencepiece==0.1.99
+     - six==1.16.0
+     - sniffio==1.3.0
+     - soundfile==0.12.1
+     - soxr==0.3.5
+     - stack-data==0.6.2
+     - starlette==0.27.0
+     - sympy==1.12
+     - tabulate==0.9.0
+     - threadpoolctl==3.2.0
+     - tokenizers==0.13.3
+     - tomli==2.0.1
+     - toolz==0.12.0
+     - torch==2.0.1
+     - torchaudio==2.0.2
+     - tqdm==4.65.0
+     - traitlets==5.9.0
+     - transformers==4.31.0
+     - triton==2.0.0
+     - typing-extensions==4.7.1
+     - tzdata==2023.3
+     - uc-micro-py==1.0.2
+     - urllib3==2.0.4
+     - uvicorn==0.23.2
+     - vector-quantize-pytorch==1.6.30
+     - wcwidth==0.2.6
+     - websockets==11.0.3
+     - werkzeug==2.3.6
+     - yarl==1.9.2
+     - zipp==3.16.2
EnvsSetup/AudioCraft.sh ADDED
@@ -0,0 +1,16 @@
+ conda env create -f Envs/AudioCraft.yml
+ conda run --live-stream -n AudioCraft pip install -U git+https://git@github.com/facebookresearch/audiocraft@c5157b5bf14bf83449c17ea1eeb66c19fb4bc7f0#egg=audiocraft
+ # The steps below fix runtime errors such as:
+ #   "Could not load library libcudnn_cnn_infer.so.8." and
+ #   "Error: libnvrtc.so: cannot open shared object file: No such file or directory"
+ CONDAENV=AudioCraft
+ source activate ${CONDAENV}
+ conda install -c "nvidia/label/cuda-11.8.0" cuda-toolkit
+ python3 -m pip install nvidia-cudnn-cu11==8.5.0.96
+ source deactivate
+ mkdir -p $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d
+ echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
+ echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/:$CUDNN_PATH/lib' >> $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
+ source $CONDA_PREFIX/envs/${CONDAENV}/etc/conda/activate.d/env_vars.sh
+
+ # If you're using WSL2, you can add the following to ~/.bashrc
+ # export LD_LIBRARY_PATH=/usr/lib/wsl/lib:$LD_LIBRARY_PATH
EnvsSetup/Bark.sh ADDED
@@ -0,0 +1 @@
+ conda env create -f Envs/Bark.yml
EnvsSetup/VoiceFixer.sh ADDED
@@ -0,0 +1 @@
+ conda env create -f Envs/VoiceFixer.yml
EnvsSetup/WavJourney.sh ADDED
@@ -0,0 +1 @@
+ conda env create -f Envs/WavJourney.yml
README.md CHANGED
@@ -1,13 +1,47 @@
- ---
- title: WavJourney
- emoji: 🔥
- colorFrom: blue
- colorTo: purple
- sdk: gradio
- sdk_version: 3.40.1
- app_file: app.py
- pinned: false
- license: cc-by-nc-nd-4.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # How to run WavJourney?
+ 1. Set up the environments by running the bash scripts in `EnvsSetup/`
+ 2. Start the API services; the service logs are written to `services_logs/`
+ ```bash
+ python scripts/start_services.py
+ ```
+ 3. Run the AudioJourney client; the generated scripts and audio are saved under `output/[datetime]_[instruction text]/`
+ ```bash
+ conda activate AudioJourney
+ python audiojourney_cli.py -f --instruction "News channel BBC broadcast about Trump playing street fighter 6 against Biden"
+ ```
+ 4. Kill the API services
+ ```bash
+ python scripts/kill_services.py
+ ```
+
+ 5. Start the UI
+ ```bash
+ sh scripts/start_ui.sh
+ ```
+
+
+ # Voice Presets
+ You can add voice presets to WavJourney to customize the voice actors. Simply provide a voice id, a description, and a sample wav file, and WavJourney will pick the voice automatically based on the audio script.
+
+ Predefined system voice presets are in `data/voice_presets`, whereas session voice presets are in each session's individual folder. See the example below:
+
+ - 📂 **project_folder**
+   - 📂 **data**
+     - 📂 **voice_presets** <-- system voice presets
+       - 📄 **metadata.json** <-- system voice preset metadata
+       - 📂 **npz**
+   - 📂 **output**
+     - 📂 **sessions**
+       - 📂 **session_1**
+         - 📂 **voice_presets** <-- session voice presets
+           - 📄 **metadata.json** <-- session voice preset metadata
+           - 📂 **npz**
+       - 📂 **session_2**
+       - **...**
+
+ ## Add a voice to the system voice presets via the command line
+ It is recommended to manage voice presets via the UI. However, if you want to add a voice preset from the command line, run the script below:
+ ```bash
+ python add_voice_preset.py --id "id" --desc "description" --wav-path path/to/wav --session-id session-id
+ ```
+ If `session-id` is set to `''`, the voice is added to the system voice presets.
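As a setup note, all of the services above read their ports and options from a gitignored `config.yaml`. The exact file is not part of this commit; the sketch below only shows the keys the service apps actually read, with placeholder values assumed (note that `Text-to-Audio` and `Text-to-Music` must share a port, since both are served by `AudioCraft/app.py`):

```yaml
# Hypothetical values; the keys match those read by APIs.py and the service apps.
Text-to-Speech:
  service-port: 8101
  speed: 1.0            # SpeedPerturbation factor used by Bark/app.py
Text-to-Music:
  service-port: 8102
  model_size: medium    # selects facebook/musicgen-{model_size}
Text-to-Audio:
  service-port: 8102    # must equal the Text-to-Music port
Speech-Restoration:
  service-port: 8103
  Enable: true          # toggles the restoration pass after TTS
Voice-Parser:
  service-port: 8104
  device: cpu
```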
VoiceFixer/app.py ADDED
@@ -0,0 +1,55 @@
+ import os
+ import logging
+ from voicefixer import VoiceFixer
+ from flask import Flask, request, jsonify
+
+ # Configure the logging format and level
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ # Create a FileHandler for the log file
+ os.makedirs('services_logs', exist_ok=True)
+ log_filename = 'services_logs/Speech-Restoration.log'
+ file_handler = logging.FileHandler(log_filename, mode='w')
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+
+ # Add the FileHandler to the root logger
+ logging.getLogger('').addHandler(file_handler)
+
+ # Initialize the model here
+ vf = VoiceFixer()
+ logging.info('VoiceFixer is loaded ...')
+
+ app = Flask(__name__)
+
+ @app.route('/fix_audio', methods=['POST'])
+ def fix_audio():
+     # Receive the filename from the POST request
+     data = request.json
+     processfile = data['processfile']
+
+     logging.info(f'Fixing {processfile} ...')
+
+     try:
+         vf.restore(input=processfile, output=processfile, cuda=True, mode=0)
+
+         # Return success message and the filename of the restored audio
+         return jsonify({'message': 'Speech restored successfully', 'file': processfile})
+
+     except Exception as e:
+         # Return error message if something goes wrong
+         return jsonify({'API error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     import yaml
+     with open('config.yaml', 'r') as file:
+         config = yaml.safe_load(file)
+
+     service_port = config['Speech-Restoration']['service-port']
+     app.run(debug=False, port=service_port)
+
VoiceParser/__init__.py ADDED
File without changes
VoiceParser/app.py ADDED
@@ -0,0 +1,58 @@
+ import os
+ import logging
+ import yaml
+ from model import VoiceParser
+ from flask import Flask, request, jsonify
+
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ service_port = config['Voice-Parser']['service-port']
+ vp_device = config['Voice-Parser']['device']
+
+ # Configure the logging format and level
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+
+ # Create a FileHandler for the log file
+ os.makedirs('services_logs', exist_ok=True)
+ log_filename = 'services_logs/Voice-Parser.log'
+ file_handler = logging.FileHandler(log_filename, mode='w')
+ file_handler.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
+
+ # Add the FileHandler to the root logger
+ logging.getLogger('').addHandler(file_handler)
+
+ # Initialize the model here
+ vp = VoiceParser(device=vp_device)
+ logging.info('VoiceParser is loaded ...')
+
+ app = Flask(__name__)
+
+ @app.route('/parse_voice', methods=['POST'])
+ def parse_voice():
+     # Receive the paths from the POST request
+     data = request.json
+     wav_path = data['wav_path']
+     out_dir = data['out_dir']
+
+     logging.info(f'Parsing {wav_path} ...')
+
+     try:
+         vp.extract_acoustic_embed(wav_path, out_dir)
+
+         # Return success message for the parsed file
+         return jsonify({'message': f'Successfully parsed {wav_path}'})
+
+     except Exception as e:
+         # Return error message if something goes wrong
+         return jsonify({'API error': str(e)}), 500
+
+
+ if __name__ == '__main__':
+     app.run(debug=False, port=service_port)
+
VoiceParser/customtokenizer.py ADDED
@@ -0,0 +1,202 @@
+ """
+ Custom tokenizer model.
+ Author: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ import json
+ import os.path
+ from zipfile import ZipFile
+ from typing import Union
+
+
+ import numpy
+ import torch
+ from torch import nn, optim
+ from torch.serialization import MAP_LOCATION
+
+
+ class CustomTokenizer(nn.Module):
+     def __init__(self, hidden_size=1024, input_size=768, output_size=10000, version=0):
+         super(CustomTokenizer, self).__init__()
+         next_size = input_size
+         if version == 0:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             next_size = hidden_size
+         if version == 1:
+             self.lstm = nn.LSTM(input_size, hidden_size, 2, batch_first=True)
+             self.intermediate = nn.Linear(hidden_size, 4096)
+             next_size = 4096
+
+         self.fc = nn.Linear(next_size, output_size)
+         self.softmax = nn.LogSoftmax(dim=1)
+         self.optimizer: optim.Optimizer = None
+         self.lossfunc = nn.CrossEntropyLoss()
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     def forward(self, x):
+         x, _ = self.lstm(x)
+         if self.version == 1:
+             x = self.intermediate(x)
+         x = self.fc(x)
+         x = self.softmax(x)
+         return x
+
+     @torch.no_grad()
+     def get_token(self, x):
+         """
+         Used to get the discrete tokens for a batch of semantic feature vectors.
+         :param x: An array with shape (N, input_size) where N is a whole number greater or equal to 1, and input_size is the input size used when creating the model.
+         :return: An array with shape (N,) where N is the same as N from the input. Every number in the array is a whole number in range 0...output_size - 1 where output_size is the output size used when creating the model.
+         """
+         return torch.argmax(self(x), dim=1)
+
+     def prepare_training(self):
+         self.optimizer = optim.Adam(self.parameters(), 0.001)
+
+     def train_step(self, x_train, y_train, log_loss=False):
+         # y_train = y_train[:-1]
+         # y_train = y_train[1:]
+
+         optimizer = self.optimizer
+         lossfunc = self.lossfunc
+         # Zero the gradients
+         self.zero_grad()
+
+         # Forward pass
+         y_pred = self(x_train)
+
+         y_train_len = len(y_train)
+         y_pred_len = y_pred.shape[0]
+
+         # Trim whichever sequence is longer so both have the same length
+         if y_train_len > y_pred_len:
+             diff = y_train_len - y_pred_len
+             y_train = y_train[diff:]
+         elif y_train_len < y_pred_len:
+             diff = y_pred_len - y_train_len
+             y_pred = y_pred[:-diff, :]
+
+         y_train_hot = torch.zeros(len(y_train), self.output_size)
+         y_train_hot[range(len(y_train)), y_train] = 1
+         y_train_hot = y_train_hot.to('cuda')
+
+         # Calculate the loss
+         loss = lossfunc(y_pred, y_train_hot)
+
+         # Print loss
+         if log_loss:
+             print('Loss', loss.item())
+
+         # Backward pass
+         loss.backward()
+
+         # Update the weights
+         optimizer.step()
+
+     def save(self, path):
+         info_path = '.'.join(os.path.basename(path).split('.')[:-1]) + '/.info'
+         torch.save(self.state_dict(), path)
+         data_from_model = Data(self.input_size, self.hidden_size, self.output_size, self.version)
+         with ZipFile(path, 'a') as model_zip:
+             model_zip.writestr(info_path, data_from_model.save())
+             model_zip.close()
+
+     @staticmethod
+     def load_from_checkpoint(path, map_location: MAP_LOCATION = None):
+         old = True
+         with ZipFile(path) as model_zip:
+             filesMatch = [file for file in model_zip.namelist() if file.endswith('/.info')]
+             file = filesMatch[0] if filesMatch else None
+             if file:
+                 old = False
+                 data_from_model = Data.load(model_zip.read(file).decode('utf-8'))
+             model_zip.close()
+         if old:
+             model = CustomTokenizer()
+         else:
+             model = CustomTokenizer(data_from_model.hidden_size, data_from_model.input_size, data_from_model.output_size, data_from_model.version)
+         model.load_state_dict(torch.load(path, map_location=map_location))
+         if map_location:
+             model = model.to(map_location)
+         return model
+
+
+
+ class Data:
+     input_size: int
+     hidden_size: int
+     output_size: int
+     version: int
+
+     def __init__(self, input_size=768, hidden_size=1024, output_size=10000, version=0):
+         self.input_size = input_size
+         self.hidden_size = hidden_size
+         self.output_size = output_size
+         self.version = version
+
+     @staticmethod
+     def load(string):
+         data = json.loads(string)
+         return Data(data['input_size'], data['hidden_size'], data['output_size'], data['version'])
+
+     def save(self):
+         data = {
+             'input_size': self.input_size,
+             'hidden_size': self.hidden_size,
+             'output_size': self.output_size,
+             'version': self.version,
+         }
+         return json.dumps(data)
+
+
+ def auto_train(data_path, save_path='model.pth', load_model: Union[str, None] = None, save_epochs=1):
+     data_x, data_y = {}, {}
+
+     if load_model and os.path.isfile(load_model):
+         print('Loading model from', load_model)
+         model_training = CustomTokenizer.load_from_checkpoint(load_model, 'cuda')
+     else:
+         print('Creating new model.')
+         model_training = CustomTokenizer(version=1).to('cuda')
+     save_path = os.path.join(data_path, save_path)
+     base_save_path = '.'.join(save_path.split('.')[:-1])
+
+     sem_string = '_semantic.npy'
+     feat_string = '_semantic_features.npy'
+
+     ready = os.path.join(data_path, 'ready')
+     for input_file in os.listdir(ready):
+         full_path = os.path.join(ready, input_file)
+         try:
+             prefix = input_file.split("_")[0]
+             number = int(prefix)
+         except ValueError as e:
+             raise e
+         if input_file.endswith(sem_string):
+             data_y[number] = numpy.load(full_path)
+         elif input_file.endswith(feat_string):
+             data_x[number] = numpy.load(full_path)
+
+     model_training.prepare_training()
+     epoch = 1
+
+     while 1:
+         for _ in range(save_epochs):
+             j = 0
+             for i in range(max(len(data_x), len(data_y))):
+                 x = data_x.get(i)
+                 y = data_y.get(i)
+                 if x is None or y is None:
+                     print(f'The training data does not match. key={i}')
+                     continue
+                 model_training.train_step(torch.tensor(x).to('cuda'), torch.tensor(y).to('cuda'), j % 50 == 0)  # Print loss every 50 steps
+                 j += 1
+         save_p = save_path
+         save_p_2 = f'{base_save_path}_epoch_{epoch}.pth'
+         model_training.save(save_p)
+         model_training.save(save_p_2)
+         print(f'Epoch {epoch} completed')
+         epoch += 1
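As a usage note, `load_from_checkpoint` and `get_token` are the only entry points the rest of the repo needs (see `VoiceParser/model.py`). A minimal sketch with a hypothetical checkpoint path and random features:

```python
import torch
from customtokenizer import CustomTokenizer

# Hypothetical path; HuBERTManager downloads this file as VoiceParser/hubert/tokenizer.pth.
quant = CustomTokenizer.load_from_checkpoint('VoiceParser/hubert/tokenizer.pth', 'cpu')

feats = torch.randn(250, 768)    # (N, input_size) HuBERT feature vectors
tokens = quant.get_token(feats)  # (N,) discrete semantic token ids
print(tokens.shape)
```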
VoiceParser/hubert_manager.py ADDED
@@ -0,0 +1,33 @@
+ import os.path
+ import shutil
+ import urllib.request
+
+ import huggingface_hub
+
+
+ class HuBERTManager:
+     @staticmethod
+     def make_sure_hubert_installed(download_url: str = 'https://dl.fbaipublicfiles.com/hubert/hubert_base_ls960.pt', file_name: str = 'hubert.pt'):
+         install_dir = os.path.join('VoiceParser', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, file_name)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT base model')
+             urllib.request.urlretrieve(download_url, install_file)
+             print('Downloaded HuBERT')
+         return install_file
+
+     @staticmethod
+     def make_sure_tokenizer_installed(model: str = 'quantifier_hubert_base_ls960_14.pth', repo: str = 'GitMylo/bark-voice-cloning', local_file: str = 'tokenizer.pth'):
+         install_dir = os.path.join('VoiceParser', 'hubert')
+         if not os.path.isdir(install_dir):
+             os.makedirs(install_dir, exist_ok=True)
+         install_file = os.path.join(install_dir, local_file)
+         if not os.path.isfile(install_file):
+             print('Downloading HuBERT custom tokenizer')
+             huggingface_hub.hf_hub_download(repo, model, local_dir=install_dir, local_dir_use_symlinks=False)
+             shutil.move(os.path.join(install_dir, model), install_file)
+             print('Downloaded tokenizer')
+         return install_file
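
A minimal usage sketch for the manager above; both calls download on the first run and are no-ops afterwards, since the files land under `VoiceParser/hubert/` relative to the working directory:

```python
# Fetch the HuBERT base checkpoint and the custom tokenizer once.
from VoiceParser.hubert_manager import HuBERTManager  # import path is an assumption

hubert_ckpt = HuBERTManager.make_sure_hubert_installed()        # VoiceParser/hubert/hubert.pt
tokenizer_ckpt = HuBERTManager.make_sure_tokenizer_installed()  # VoiceParser/hubert/tokenizer.pth
print(hubert_ckpt, tokenizer_ckpt)
```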
VoiceParser/model.py ADDED
@@ -0,0 +1,102 @@
+ import os
+ import json
+ import numpy as np
+
+ import torch
+ import torchaudio
+ torchaudio.set_audio_backend("soundfile")  # Use 'soundfile' backend
+
+ from encodec import EncodecModel
+ from encodec.utils import convert_audio
+ from hubert_manager import HuBERTManager
+ from pre_kmeans_hubert import CustomHubert
+ from customtokenizer import CustomTokenizer
+
+
+ class VoiceParser:
+     def __init__(self, device='cpu'):
+         model = ('quantifier_hubert_base_ls960_14.pth', 'tokenizer.pth')
+
+         hubert_model = CustomHubert(HuBERTManager.make_sure_hubert_installed(), device=device)
+         quant_model = CustomTokenizer.load_from_checkpoint(HuBERTManager.make_sure_tokenizer_installed(model=model[0], local_file=model[1]), device)
+         encodec_model = EncodecModel.encodec_model_24khz()
+         encodec_model.set_target_bandwidth(6.0)
+
+         self.hubert_model = hubert_model
+         self.quant_model = quant_model
+         self.encodec_model = encodec_model.to(device)
+         self.device = device
+         print('Loaded VoiceParser models!')
+
+     def extract_acoustic_embed(self, wav_path, npz_dir):
+         wav, sr = torchaudio.load(wav_path)
+
+         wav_hubert = wav.to(self.device)
+
+         if wav_hubert.shape[0] == 2:  # Stereo to mono if needed
+             wav_hubert = wav_hubert.mean(0, keepdim=True)
+
+         semantic_vectors = self.hubert_model.forward(wav_hubert, input_sample_hz=sr)
+         semantic_tokens = self.quant_model.get_token(semantic_vectors)
+         wav = convert_audio(wav, sr, self.encodec_model.sample_rate, 1).unsqueeze(0)
+
+         wav = wav.to(self.device)
+
+         with torch.no_grad():
+             encoded_frames = self.encodec_model.encode(wav)
+
+         codes = torch.cat([encoded[0] for encoded in encoded_frames], dim=-1).squeeze()
+
+         codes = codes.cpu()
+         semantic_tokens = semantic_tokens.cpu()
+
+         wav_name = os.path.split(wav_path)[1]
+         npz_name = wav_name[:-4] + '.npz'
+         npz_path = os.path.join(npz_dir, npz_name)
+
+         np.savez(
+             npz_path,
+             semantic_prompt=semantic_tokens,
+             fine_prompt=codes,
+             coarse_prompt=codes[:2, :]
+         )
+
+         return npz_path
+
+     def read_json_file(self, json_path):
+         with open(json_path, 'r') as file:
+             data = json.load(file)
+         return data
+
+     def parse_voice_json(self, voice_json, output_dir):
+         """
+         Parse a voice json file and generate the corresponding output json and npz files.
+         Params:
+             voice_json: path of a json file, or a list of json nodes
+             output_dir: output dir for the new json and npz files
+         """
+         if not isinstance(voice_json, list):
+             # If voice_json is a file path (str), read the JSON file
+             voice_json = self.read_json_file(voice_json)
+         for item in voice_json:
+             wav_path = item['wav']
+             npz_path = self.extract_acoustic_embed(wav_path=wav_path, npz_dir=output_dir)
+             item['npz'] = npz_path
+             del item['wav']
+
+         output_json = os.path.join(output_dir, 'metadata.json')
+
+         with open(output_json, 'w') as file:
+             json.dump(voice_json, file, indent=4)
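
A sketch of turning a reference recording into a Bark-style voice prompt with the class above. The wav path is a placeholder, and the sibling-module imports (`hubert_manager`, `pre_kmeans_hubert`, `customtokenizer`) mean it should be run from the `VoiceParser/` directory:

```python
# Extract semantic/coarse/fine prompts from a reference wav into an .npz file.
from model import VoiceParser

parser = VoiceParser(device='cpu')            # or 'cuda' if available
npz_path = parser.extract_acoustic_embed(
    wav_path='my_speaker.wav',                # placeholder reference recording
    npz_dir='.',                              # output directory for the .npz
)
print(f'voice prompt saved to {npz_path}')    # ./my_speaker.npz
```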
VoiceParser/pre_kmeans_hubert.py ADDED
@@ -0,0 +1,106 @@
+ """
+ Modified HuBERT model without kmeans.
+ Original author: https://github.com/lucidrains/
+ Modified by: https://www.github.com/gitmylo/
+ License: MIT
+ """
+
+ # Modified code from https://github.com/lucidrains/audiolm-pytorch/blob/main/audiolm_pytorch/hubert_kmeans.py
+
+ from pathlib import Path
+
+ import torch
+ from torch import nn
+ from einops import pack, unpack
+
+ import fairseq
+
+ from torchaudio.functional import resample
+
+ from audiolm_pytorch.utils import curtail_to_multiple
+
+ import logging
+ logging.root.setLevel(logging.ERROR)
+
+
+ def exists(val):
+     return val is not None
+
+
+ def default(val, d):
+     return val if exists(val) else d
+
+
+ class CustomHubert(nn.Module):
+     """
+     checkpoint and kmeans can be downloaded at https://github.com/facebookresearch/fairseq/tree/main/examples/hubert
+     or you can train your own
+     """
+
+     def __init__(
+         self,
+         checkpoint_path,
+         target_sample_hz=16000,
+         seq_len_multiple_of=None,
+         output_layer=9,
+         device=None
+     ):
+         super().__init__()
+         self.target_sample_hz = target_sample_hz
+         self.seq_len_multiple_of = seq_len_multiple_of
+         self.output_layer = output_layer
+
+         if device is not None:
+             self.to(device)
+
+         model_path = Path(checkpoint_path)
+
+         assert model_path.exists(), f'path {checkpoint_path} does not exist'
+
+         checkpoint = torch.load(checkpoint_path, map_location=device)
+         load_model_input = {checkpoint_path: checkpoint}
+         model, *_ = fairseq.checkpoint_utils.load_model_ensemble_and_task(load_model_input)
+
+         if device is not None:
+             model[0].to(device)
+
+         self.model = model[0]
+         self.model.eval()
+
+     @property
+     def groups(self):
+         return 1
+
+     @torch.no_grad()
+     def forward(
+         self,
+         wav_input,
+         flatten=True,
+         input_sample_hz=None
+     ):
+         device = wav_input.device
+
+         if exists(input_sample_hz):
+             wav_input = resample(wav_input, input_sample_hz, self.target_sample_hz)
+
+         if exists(self.seq_len_multiple_of):
+             wav_input = curtail_to_multiple(wav_input, self.seq_len_multiple_of)
+
+         embed = self.model(
+             wav_input,
+             features_only=True,
+             mask=False,  # thanks to @maitycyrus for noticing that mask is defaulted to True in the fairseq code
+             output_layer=self.output_layer
+         )
+
+         embed, packed_shape = pack([embed['x']], '* d')
+
+         # codebook_indices = self.kmeans.predict(embed.cpu().detach().numpy())
+
+         codebook_indices = torch.from_numpy(embed.cpu().detach().numpy()).to(device)  # .long()
+
+         if flatten:
+             return codebook_indices
+
+         codebook_indices, = unpack(codebook_indices, packed_shape, '*')
+         return codebook_indices
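
A minimal sketch of running the feature extractor above, assuming the checkpoint fetched by `HuBERTManager.make_sure_hubert_installed()` is available at the path shown:

```python
# Extract layer-9 HuBERT features from one second of (silent) audio.
import torch
from pre_kmeans_hubert import CustomHubert  # run from the VoiceParser/ directory

hubert = CustomHubert(checkpoint_path='hubert/hubert.pt', device='cpu')
wav = torch.zeros(1, 24000)                         # (channels, samples), mono
feats = hubert.forward(wav, input_sample_hz=24000)  # resampled to 16 kHz internally
print(feats.shape)                                  # roughly (frames, 768) for hubert-base
```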
add_voice_preset.py ADDED
@@ -0,0 +1,21 @@
+ import argparse
+ import voice_presets
+
+
+ def main():
+     # Argument Parsing
+     parser = argparse.ArgumentParser(description="Add Voice Preset")
+     parser.add_argument("--id", required=True, help="ID of the voice")
+     parser.add_argument("--desc", required=True, help="Description of the voice")
+     parser.add_argument("--wav-path", required=True, help="Path to the .wav file")
+     parser.add_argument("--session-id", required=True, help="session_id; if set to '' the preset is added as a system voice preset")
+     args = parser.parse_args()
+
+     if args.session_id:
+         print(voice_presets.add_session_voice_preset(args.id, args.desc, args.wav_path, args.session_id))
+     else:
+         print(voice_presets.add_system_voice_preset(args.id, args.desc, args.wav_path))
+
+
+ if __name__ == "__main__":
+     main()
code_generator.py ADDED
@@ -0,0 +1,190 @@
+ import os
+ import json5
+ import utils
+
+
+ def check_json_script(data):
+     foreground_mandatory_attrs_map = {
+         'music': ['vol', 'len', 'desc'],
+         'sound_effect': ['vol', 'len', 'desc'],
+         'speech': ['vol', 'text']
+     }
+     background_mandatory_attrs_map = {
+         'music': ['vol', 'desc'],
+         'sound_effect': ['vol', 'desc'],
+     }
+
+     def check_by_audio_type(audio, mandatory_attrs_map, audio_str):
+         if audio['audio_type'] not in mandatory_attrs_map:
+             raise ValueError(f'audio_type is not allowed in this layout, audio={audio_str}')
+         for attr_name in mandatory_attrs_map[audio['audio_type']]:
+             if attr_name not in audio:
+                 raise ValueError(f'{attr_name} does not exist, audio={audio_str}')
+
+     # Check the json's format
+     for audio in data:
+         audio_str = json5.dumps(audio, indent=None)
+         if 'layout' not in audio:
+             raise ValueError(f'layout missing, audio={audio_str}')
+         elif 'audio_type' not in audio:
+             raise ValueError(f'audio_type missing, audio={audio_str}')
+         elif audio['layout'] == 'foreground':
+             check_by_audio_type(audio, foreground_mandatory_attrs_map, audio_str)
+         elif audio['layout'] == 'background':
+             if 'id' not in audio:
+                 raise ValueError(f'id not in background audio, audio={audio_str}')
+             if 'action' not in audio:
+                 raise ValueError(f'action not in background audio, audio={audio_str}')
+             if audio['action'] == 'begin':
+                 check_by_audio_type(audio, background_mandatory_attrs_map, audio_str)
+             else:
+                 if audio['action'] != 'end':
+                     raise ValueError(f'Unknown action, audio={audio_str}')
+         else:
+             raise ValueError(f'Unknown layout, audio={audio_str}')
+     # except Exception as err:
+     #     sys.stderr.write(f'PARSING ERROR: {err}, audio={json5.dumps(audio, indent=None)}\n')
+     #     all_clear = False
+
+
+ def collect_and_check_audio_data(data):
+     fg_audio_id = 0
+     fg_audios = []
+     bg_audios = []
+     # Collect all the foreground and background audio ids, used to calculate background audio lengths later
+     for audio in data:
+         if audio['layout'] == 'foreground':
+             audio['id'] = fg_audio_id
+             fg_audios.append(audio)
+             fg_audio_id += 1
+         else:  # background
+             if audio['action'] == 'begin':
+                 audio['begin_fg_audio_id'] = fg_audio_id
+                 bg_audios.append(audio)
+             else:  # ends
+                 # find the background audio with this id, and update its 'end_fg_audio_id'
+                 for bg_audio in bg_audios:
+                     if bg_audio['id'] == audio['id'] and bg_audio['audio_type'] == audio['audio_type']:
+                         bg_audio['end_fg_audio_id'] = fg_audio_id
+                         break
+
+     # check that all background audios are valid
+     for bg_audio in bg_audios:
+         if 'begin_fg_audio_id' not in bg_audio:
+             raise ValueError(f'begin of background missing, audio={bg_audio}')
+         elif 'end_fg_audio_id' not in bg_audio:
+             raise ValueError(f'end of background missing, audio={bg_audio}')
+
+         if bg_audio['begin_fg_audio_id'] > bg_audio['end_fg_audio_id']:
+             raise ValueError(f'background audio ends before it starts, audio={bg_audio}')
+         elif bg_audio['begin_fg_audio_id'] == bg_audio['end_fg_audio_id']:
+             raise ValueError(f'background audio contains no foreground audio, audio={bg_audio}')
+     # except Exception as err:
+     #     sys.stderr.write(f'ALIGNMENT ERROR: {err}, audio={bg_audio}\n')
+     #     return None, None
+
+     return fg_audios, bg_audios
+
+
+ class AudioCodeGenerator:
+     def __init__(self):
+         self.wav_counters = {
+             'bg_sound_effect': 0,
+             'bg_music': 0,
+             'idle': 0,
+             'fg_sound_effect': 0,
+             'fg_music': 0,
+             'fg_speech': 0,
+         }
+         self.code = ''
+
+     def append_code(self, content):
+         self.code = f'{self.code}{content}\n'
+
+     def generate_code(self, fg_audios, bg_audios, output_path, result_filename):
+         def get_wav_name(audio):
+             audio_type = audio['audio_type']
+             layout = 'fg' if audio['layout'] == 'foreground' else 'bg'
+             wav_type = f'{layout}_{audio_type}' if layout else audio_type
+             desc = audio['text'] if 'text' in audio else audio['desc']
+             desc = utils.text_to_abbrev_prompt(desc)
+             wav_filename = f'{wav_type}_{self.wav_counters[wav_type]}_{desc}.wav'
+             self.wav_counters[wav_type] += 1
+             return wav_filename
+
+         header = f'''
+ import sys
+ sys.path.append('../AudioJourney')
+
+ import os
+ import datetime
+
+ from APIs import TTM, TTS, TTA, MIX, CAT, COMPUTE_LEN
+
+
+ fg_audio_lens = []
+ wav_path = \"{output_path.absolute()}/audio\"
+ os.makedirs(wav_path, exist_ok=True)
+
+ '''
+         self.append_code(header)
+
+         fg_audio_wavs = []
+         for fg_audio in fg_audios:
+             wav_name = get_wav_name(fg_audio)
+             if fg_audio['audio_type'] == 'sound_effect':
+                 self.append_code(f'TTA(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
+             elif fg_audio['audio_type'] == 'music':
+                 self.append_code(f'TTM(text=\"{fg_audio["desc"]}\", length={fg_audio["len"]}, volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
+             elif fg_audio['audio_type'] == 'speech':
+                 npz_path = self.char_to_voice_map[fg_audio["character"]]["npz_path"]
+                 npz_full_path = os.path.abspath(npz_path) if os.path.exists(npz_path) else npz_path
+                 self.append_code(f'TTS(text=\"{fg_audio["text"]}\", speaker_id=\"{self.char_to_voice_map[fg_audio["character"]]["id"]}\", volume={fg_audio["vol"]}, out_wav=os.path.join(wav_path, \"{wav_name}\"), speaker_npz=\"{npz_full_path}\")')
+             fg_audio_wavs.append(wav_name)
+             self.append_code(f'fg_audio_lens.append(COMPUTE_LEN(os.path.join(wav_path, \"{wav_name}\")))\n')
+
+         # cat all foreground audio together
+         self.append_code(f'fg_audio_wavs = []')
+         for wav_filename in fg_audio_wavs:
+             self.append_code(f'fg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
+         self.append_code(f'CAT(wavs=fg_audio_wavs, out_wav=os.path.join(wav_path, \"foreground.wav\"))')
+
+         bg_audio_wavs = []
+         self.append_code(f'\nbg_audio_offsets = []')
+         for bg_audio in bg_audios:
+             wav_name = get_wav_name(bg_audio)
+             self.append_code(f'bg_audio_len = sum(fg_audio_lens[{bg_audio["begin_fg_audio_id"]}:{bg_audio["end_fg_audio_id"]}])')
+             self.append_code(f'bg_audio_offset = sum(fg_audio_lens[:{bg_audio["begin_fg_audio_id"]}])')
+             if bg_audio['audio_type'] == 'sound_effect':
+                 self.append_code(f'TTA(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
+             elif bg_audio['audio_type'] == 'music':
+                 self.append_code(f'TTM(text=\"{bg_audio["desc"]}\", volume={bg_audio["vol"]}, length=bg_audio_len, out_wav=os.path.join(wav_path, \"{wav_name}\"))')
+             else:
+                 raise ValueError(f'Unknown background audio type, audio={bg_audio}')
+             bg_audio_wavs.append(wav_name)
+             self.append_code(f'bg_audio_offsets.append(bg_audio_offset)\n')
+         self.append_code(f'bg_audio_wavs = []')
+         for wav_filename in bg_audio_wavs:
+             self.append_code(f'bg_audio_wavs.append(os.path.join(wav_path, \"{wav_filename}\"))')
+
+         self.append_code(f'bg_audio_wav_offset_pairs = list(zip(bg_audio_wavs, bg_audio_offsets))')
+         self.append_code(f'bg_audio_wav_offset_pairs.append((os.path.join(wav_path, \"foreground.wav\"), 0))')
+         self.append_code(f'MIX(wavs=bg_audio_wav_offset_pairs, out_wav=os.path.join(wav_path, \"{result_filename}.wav\"))')
+
+     def init_char_to_voice_map(self, filename):
+         with open(filename, 'r') as file:
+             self.char_to_voice_map = json5.load(file)
+
+     def parse_and_generate(self, script_filename, char_to_voice_map_filename, output_path, result_filename='result'):
+         self.code = ''
+         self.init_char_to_voice_map(char_to_voice_map_filename)
+
+         with open(script_filename, 'r') as file:
+             data = json5.load(file)
+
+         check_json_script(data)
+         fg_audios, bg_audios = collect_and_check_audio_data(data)
+         self.generate_code(fg_audios, bg_audios, output_path, result_filename)
+         return self.code
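
For orientation, a minimal sketch of driving the generator above directly; the file names are placeholders for a JSON script and a character-to-voice map in the formats this module validates:

```python
# Compile a JSON audio script plus a character->voice map into a Python
# program that calls the TTS/TTM/TTA/MIX/CAT APIs.
from pathlib import Path
from code_generator import AudioCodeGenerator

generator = AudioCodeGenerator()
code = generator.parse_and_generate(
    script_filename='audio_script.json',                    # placeholder path
    char_to_voice_map_filename='character_voice_map.json',  # placeholder path
    output_path=Path('output/my_session'),                  # wavs go to output/my_session/audio
)
Path('audio_generation.py').write_text(code)                # same file name pipeline.py writes
```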
convert_json_to_audio_gen_code.py ADDED
@@ -0,0 +1,30 @@
+ import argparse
+ import os
+ import json5
+ from pathlib import Path
+ from code_generator import AudioCodeGenerator
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--script", help="Path to the json script file")
+     parser.add_argument("--character-to-voice-map", help="Path to the character-to-voice mapping JSON file")
+     parser.add_argument(
+         "--path",
+         type=str,
+         default=".",
+         help="Path of all the output wav files to be created by the generated code, default: current path"
+     )
+     args = parser.parse_args()
+
+     if not os.path.isfile(args.script):
+         print(f"File {args.script} does not exist.")
+         return
+
+     output_path = Path(args.path)
+     audio_code_generator = AudioCodeGenerator()
+     code = audio_code_generator.parse_and_generate(args.script, args.character_to_voice_map, output_path)
+     print(code)
+
+
+ if __name__ == "__main__":
+     main()
data/voice_presets/metadata.json ADDED
@@ -0,0 +1,47 @@
+ {
+     "Female1": {
+         "id": "Female1",
+         "desc": "a normal female adult voice, British accent",
+         "npz_path": "v2/en_speaker_9"
+     },
+     "Female2": {
+         "id": "Female2",
+         "desc": "a normal female adult voice, American accent",
+         "npz_path": "v2/de_speaker_3"
+     },
+     "Male1": {
+         "id": "Male1",
+         "desc": "a normal male adult voice, British accent",
+         "npz_path": "v2/en_speaker_1"
+     },
+     "Male2": {
+         "id": "Male2",
+         "desc": "a normal male adult voice, American accent",
+         "npz_path": "v2/en_speaker_2"
+     },
+     "News_Male": {
+         "id": "News_Male",
+         "desc": "a male voice of a news anchor, suitable for news scenarios",
+         "npz_path": "data/voice_presets/npz/news_male_speaker.npz"
+     },
+     "News_Female": {
+         "id": "News_Female",
+         "desc": "a female voice of a news anchor, suitable for news scenarios",
+         "npz_path": "data/voice_presets/npz/news_female_speaker.npz"
+     },
+     "News_Female_Out": {
+         "id": "News_Female_Out",
+         "desc": "a female voice of an off-site news reporter, suitable for news scenarios",
+         "npz_path": "data/voice_presets/npz/news_female_speaker_outside.npz"
+     },
+     "child": {
+         "id": "child",
+         "desc": "a small young boy's voice",
+         "npz_path": "data/voice_presets/npz/child_boy.npz"
+     },
+     "old_man": {
+         "id": "old_man",
+         "desc": "the voice of an old man",
+         "npz_path": "data/voice_presets/npz/elder_morgen.npz"
+     }
+ }
data/voice_presets/npz/biden.npz ADDED
Binary file (41.8 kB).
data/voice_presets/npz/boris.npz ADDED
Binary file (38.3 kB).
data/voice_presets/npz/boy_Tom_Hiddleston.npz ADDED
Binary file (86.5 kB).
data/voice_presets/npz/child_boy.npz ADDED
Binary file (33.1 kB).
data/voice_presets/npz/cnn_male_speaker.npz ADDED
Binary file (46.6 kB).
data/voice_presets/npz/elder_morgen.npz ADDED
Binary file (30.8 kB).
data/voice_presets/npz/girl_Anne_Hathaway.npz ADDED
Binary file (74.6 kB).
data/voice_presets/npz/mark_professor.npz ADDED
Binary file (106 kB).
data/voice_presets/npz/news_female_speaker.npz ADDED
Binary file (71.8 kB).
data/voice_presets/npz/news_female_speaker_outside.npz ADDED
Binary file (60.5 kB).
data/voice_presets/npz/news_male_speaker.npz ADDED
Binary file (36 kB).
data/voice_presets/npz/trump.npz ADDED
Binary file (73.1 kB).
parse_voice.py ADDED
@@ -0,0 +1,31 @@
+ import os
+ import argparse
+ from VoiceParser.model import VoiceParser
+
+
+ def main():
+     parser = argparse.ArgumentParser()
+     parser.add_argument('--wav-path', type=str, help="Path of a wav file")
+     parser.add_argument('--wav-dir', type=str, help="Directory of wav files")
+     parser.add_argument('--out-dir', type=str, help="Directory of output npz files")
+     args = parser.parse_args()
+
+     if (args.wav_path is None and args.wav_dir is None) or (args.wav_path is not None and args.wav_dir is not None):
+         parser.error("Please provide either '--wav-path' or '--wav-dir', but not both.")
+
+     out_dir = args.out_dir
+
+     model = VoiceParser(device='cpu')
+
+     if args.wav_path is not None:
+         model.extract_acoustic_embed(args.wav_path, out_dir)
+         print(f'Successfully parsed {args.wav_path}')
+     else:
+         wav_name_list = os.listdir(args.wav_dir)
+         for wav_name in wav_name_list:
+             wav_path = os.path.join(args.wav_dir, wav_name)
+             model.extract_acoustic_embed(wav_path, out_dir)
+             print(f'Successfully parsed {wav_path}')
+
+
+ if __name__ == '__main__':
+     main()
pipeline.py ADDED
@@ -0,0 +1,235 @@
+ import argparse
+ import datetime
+ import os
+ import subprocess
+ from string import Template
+ import openai
+ import re
+ from pathlib import Path
+ import glob
+ from utils import get_key
+ import pickle
+ import time
+ import json5
+ from retrying import retry
+ from code_generator import check_json_script, collect_and_check_audio_data
+ from tabulate import tabulate
+ import random
+ import string
+
+ import utils
+ import voice_presets
+ from code_generator import AudioCodeGenerator
+
+ USE_OPENAI_CACHE = True
+ openai_cache = []
+ if USE_OPENAI_CACHE:
+     os.makedirs('cache', exist_ok=True)
+     for cache_file in glob.glob('cache/*.pkl'):
+         with open(cache_file, 'rb') as file:
+             openai_cache.append(pickle.load(file))
+
+ openai.api_key = get_key()
+
+
+ def chat_with_gpt(prompt):
+     if USE_OPENAI_CACHE:
+         filtered_object = list(filter(lambda x: x['prompt'] == prompt, openai_cache))
+         if len(filtered_object) > 0:
+             response = filtered_object[0]['response']
+             return response
+     chat = openai.ChatCompletion.create(
+         # model="gpt-3.5-turbo",
+         model="gpt-4",
+         messages=[
+             {
+                 "role": "system",
+                 "content": "You are a helpful assistant."
+             },
+             {
+                 "role": "user",
+                 "content": prompt
+             }
+         ]
+     )
+     if USE_OPENAI_CACHE:
+         cache_obj = {
+             'prompt': prompt,
+             'response': chat['choices'][0]['message']['content']
+         }
+         with open(f'cache/{time.time()}.pkl', 'wb') as _openai_cache:
+             pickle.dump(cache_obj, _openai_cache)
+         openai_cache.append(cache_obj)
+
+     return chat['choices'][0]['message']['content']
+
+
+ def get_file_content(filename):
+     with open(filename, 'r') as file:
+         return file.read().strip()
+
+
+ def write_to_file(filename, content):
+     with open(filename, 'w') as file:
+         file.write(content)
+
+
+ def extract_substring_with_quotes(input_string, quotes="'''"):
+     pattern = f"{quotes}(.*?){quotes}"
+     matches = re.findall(pattern, input_string, re.DOTALL)
+     return matches
+
+
+ def try_extract_content_from_quotes(content):
+     if "'''" in content:
+         return extract_substring_with_quotes(content)[0]
+     elif "```" in content:
+         return extract_substring_with_quotes(content, quotes="```")[0]
+     else:
+         return content
+
+
+ def maybe_get_content_from_file(content_or_filename):
+     if os.path.exists(content_or_filename):
+         with open(content_or_filename, 'r') as file:
+             return file.read().strip()
+     return content_or_filename
+
+
+ # Pipeline Interface Guidelines:
+ #
+ # Init calls:
+ # - Init calls must be called before running the actual steps
+ # - init_session() is called every time a gradio webpage is loaded
+ #
+ # Single Step:
+ # - takes input (file or content) and an output path as input
+ # - most of the time just returns output content
+ #
+ # Compositional Step:
+ # - takes session_id as input (if you have the session_id, you have all the paths)
+ # - runs a series of steps
+
+
+ # This is called for every new gradio webpage
+ def init_session(session_id=''):
+     def uid8():
+         return ''.join(random.choices(string.ascii_lowercase + string.digits, k=8))
+
+     if session_id == '':
+         session_id = f'{datetime.datetime.now().strftime("%Y%m%d%H%M%S")}_{uid8()}'
+         # create the paths
+         os.makedirs(utils.get_session_voice_preset_path(session_id))
+         os.makedirs(utils.get_session_audio_path(session_id))
+     return session_id
+
+
+ @retry(stop_max_attempt_number=3)
+ def input_text_to_json_script_with_retry(complete_prompt_path):
+     print(" trying ...")
+     complete_prompt = get_file_content(complete_prompt_path)
+     json_response = try_extract_content_from_quotes(chat_with_gpt(complete_prompt))
+     json_data = json5.loads(json_response)
+
+     try:
+         check_json_script(json_data)
+         collect_and_check_audio_data(json_data)
+     except Exception as err:
+         print(f'JSON ERROR: {err}')
+         retry_complete_prompt = f'{complete_prompt}\n```\n{json_response}```\nThe script above has format error(s). Return the fixed script.\n\nScript:\n'
+         write_to_file(complete_prompt_path, retry_complete_prompt)
+         raise err
+
+     return json_response
+
+
+ # Step 1: input_text to json
+ def input_text_to_json_script(input_text, output_path):
+     print('Step 1: Writing audio script with LLM ...')
+     input_text = maybe_get_content_from_file(input_text)
+     text_to_audio_script_prompt = get_file_content('prompts/text_to_json.prompt')
+     prompt = f'{text_to_audio_script_prompt}\n\nInput text: {input_text}\n\nScript:\n'
+     complete_prompt_path = output_path / 'complete_input_text_to_audio_script.prompt'
+     write_to_file(complete_prompt_path, prompt)
+     audio_script_response = input_text_to_json_script_with_retry(complete_prompt_path)
+     generated_audio_script_filename = output_path / 'audio_script.json'
+     write_to_file(generated_audio_script_filename, audio_script_response)
+     return audio_script_response
+
+
+ # Step 2: json to char-voice map
+ def json_script_to_char_voice_map(json_script, voices, output_path):
+     def create_complete_char_voice_map(char_voice_map):
+         return  # unused helper stub
+     print('Step 2: Parsing character voices with LLM ...')
+     json_script_content = maybe_get_content_from_file(json_script)
+     prompt = get_file_content('prompts/audio_script_to_character_voice_map.prompt')
+     presets_str = '\n'.join(f"{preset['id']}: {preset['desc']}" for preset in voices.values())
+     prompt = Template(prompt).substitute(voice_and_desc=presets_str)
+     prompt = f"{prompt}\n\nAudio script:\n'''\n{json_script_content}\n'''\n\noutput:\n"
+     write_to_file(output_path / 'complete_audio_script_to_char_voice_map.prompt', prompt)
+     char_voice_map_response = try_extract_content_from_quotes(chat_with_gpt(prompt))
+     char_voice_map = json5.loads(char_voice_map_response)
+     # enrich char_voice_map with voice preset metadata
+     complete_char_voice_map = {c: voices[char_voice_map[c]] for c in char_voice_map}
+     char_voice_map_filename = output_path / 'character_voice_map.json'
+     write_to_file(char_voice_map_filename, json5.dumps(complete_char_voice_map))
+     return complete_char_voice_map
+
+
+ # Step 3: json to py code
+ def json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_filename):
+     print('Step 3: Compiling audio script to Python program ...')
+     audio_code_generator = AudioCodeGenerator()
+     code = audio_code_generator.parse_and_generate(
+         json_script_filename,
+         char_voice_map_filename,
+         output_path,
+         result_filename
+     )
+     write_to_file(output_path / 'audio_generation.py', code)
+
+
+ # Step 4: py code to final wav
+ def audio_code_gen_to_result(audio_gen_code_path):
+     print('Step 4: Start running Python program ...')
+     audio_gen_code_filename = audio_gen_code_path / 'audio_generation.py'
+     os.system(f'python {audio_gen_code_filename}')
+
+
+ # Function call used by Gradio: input_text to json
+ def generate_json_file(session_id, input_text):
+     output_path = utils.get_session_path(session_id)
+     # Step 1
+     return input_text_to_json_script(input_text, output_path)
+
+
+ # Function call used by Gradio: json to result wav
+ def generate_audio(session_id, json_script):
+     output_path = utils.get_session_path(session_id)
+     output_audio_path = utils.get_session_audio_path(session_id)
+     voices = voice_presets.get_merged_voice_presets(session_id)
+
+     # Step 2
+     json_script_to_char_voice_map(json_script, voices, output_path)
+     # Step 3
+     json_script_filename = output_path / 'audio_script.json'
+     char_voice_map_filename = output_path / 'character_voice_map.json'
+     result_wav_basename = f'res_{session_id}'
+     json_script_and_char_voice_map_to_audio_gen_code(json_script_filename, char_voice_map_filename, output_path, result_wav_basename)
+     # Step 4
+     audio_code_gen_to_result(output_path)
+
+     result_wav_filename = output_audio_path / f'{result_wav_basename}.wav'
+     print(f'Done with all processes, result: {result_wav_filename}')
+     return result_wav_filename
+
+
+ # Convenience function used by wavjourney_cli
+ def full_steps(session_id, input_text):
+     json_script = generate_json_file(session_id, input_text)
+     return generate_audio(session_id, json_script)
+
+
+ def convert_json_to_md(audio_script_response):
+     audio_json_data = json5.loads(audio_script_response)
+     table = [[node.get(field, 'N/A') for field in ["audio_type", "layout", "id", "character", "action", 'vol']] +
+              [node.get("desc", "N/A") if node.get("audio_type") != "speech" else node.get("text", "N/A")] +
+              [node.get("len", "Auto") if "len" in node else "Auto"]
+              for node in audio_json_data]
+
+     headers = ["Audio Type", "Layout", "ID", "Character", "Action", 'Volume', "Description", "Length"]
+
+     # Tabulate
+     table_txt = tabulate(table, headers, tablefmt="github")
+     return table_txt
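
A sketch of driving the four steps above end to end from Python, assuming `config.yaml`, the prompt files, a valid OpenAI key, and the model services are all in place:

```python
# Run the whole pipeline for one instruction and print the resulting wav path.
import pipeline

session_id = pipeline.init_session()  # creates fresh session directories
result_wav = pipeline.full_steps(
    session_id,
    'A short radio news piece about a rainy day in London',  # placeholder instruction
)
print(result_wav)  # e.g. .../audio/res_<session_id>.wav
```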
prompts/audio_script_to_character_voice_map.prompt ADDED
@@ -0,0 +1,11 @@
+ Given an audio script in json format, for each character appearing in the "character" attribute, you should map the character to a "voice type" according to his/her lines and the voice type's features. Each character must be mapped to a different voice type, and each voice type must be one of the following (each line in the format of "[voice_type_id]: [voice_type_description]"):
+ $voice_and_desc
+
+ Output should be in the format of json, like:
+ '''
+ {
+     "character_1": "voice_type_1",
+     "character_2": "voice_type_2",
+     ...
+ }
+ '''
prompts/audio_script_to_json.prompt ADDED
@@ -0,0 +1,74 @@
+ Given an audio script, adapt it into a json file. You must go through each line of the script, and try your best to convert it to a json object or multiple json objects.
+
+ Each json object represents an audio. There are three types of audio: sound effect, music, and speech. For each audio, there are two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played.
+
+ While going through each line of the script, you have the choices below:
+ - For character lines, you need to convert it to a speech audio. Note that a speech audio can only be foreground. Example:
+ From
+ ```
+ News Anchor: Good evening, this is BBC News.
+ ```
+ To
+ ```
+ {"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News."},
+ ```
+ - For sound effects, you need to convert it to a sound_effect audio. In particular, you need to figure out its length according to the script's context, and put it into "len". Example:
+ From
+ ```
+ (SFX: Airport beeping sound)
+ ```
+ to
+ ```
+ {"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"},
+ ```
+ - For music, you need to convert it to a music audio. In particular, you need to figure out its length according to the script's context, and put it into "len". Example:
+ From
+ ```
+ (SFX: Uplifting newsroom music)
+ ```
+ to
+ ```
+ {"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"},
+ ```
+
+ When a sound effect or music is played environmentally in the background, you should set its layout to "background". You must give the background audio a unique id, and you must figure out the end of the background audio according to the context and indicate it explicitly. Example:
+ From
+ ```
+ ...
+ (SFX: Airport ambiance, people walking)
+ Airport Announcer: Ladies and Gentlemen, attention please!
+ ...
+ ```
+ to
+ ```
+ ...
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"},
+ [foreground audio]
+ ...
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
+ ...
+ ```
+
+ When a line contains multiple sound effects and pieces of music, you need to decompose it into multiple audios. Example:
+ From
+ ```
+ ...
+ (SFX: A classy restaurant, low chatter, clinking silverware, jazz music playing)
+ ...
+ ```
+ to
+ ```
+ ...
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "low chatter"},
+ {"audio_type": "sound_effect", "layout": "background", "id":2, "action": "begin", "vol": -35, "desc": "clinking silverware"},
+ {"audio_type": "music", "layout": "background", "id":3, "action": "begin", "vol": -35, "desc": "jazz music"},
+ ...
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
+ {"audio_type": "sound_effect", "layout": "background", "id":2, "action": "end"},
+ {"audio_type": "music", "layout": "background", "id":3, "action": "end"},
+ ...
+ ```
+
+ The final json object contains a list of all the audio objects.
+
+ Script:
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Convert an audio script line to another format. Each line will be converted to a simple json format. Below are the examples of conversion of each line.
2
+
3
+ Example line 1:
4
+ '''
5
+ [Background music 1 begins, -35dB: Uplifting newsroom music]
6
+ '''
7
+ convert to:
8
+ '''
9
+ {"voice_type": "back_ground_music", "id": 1, "state": "begin", "volume": -35, "desc": "Uplifting newsroom music"},
10
+ '''
11
+ Example line 2:
12
+ '''
13
+ [Background music 1 ends]
14
+ '''
15
+ convert to:
16
+ '''
17
+ {"voice_type": "back_ground_music", "id": 1, "state": "end"},
18
+ '''
19
+ Example line 3:
20
+ '''
21
+ [Background sound effect 2 begins, -35dB: Crowds cheering and arcade ambiance]
22
+ '''
23
+ convert to:
24
+ '''
25
+ {"voice_type": "back_ground_sound_effect", "id": 2, "state": "begin", "volume": -35, "desc": "Crowds cheering and arcade ambiance"},
26
+ '''
27
+ Example line 4:
28
+ '''
29
+ [Background sound effect 2 ends]
30
+ '''
31
+ convert to:
32
+ '''
33
+ {"voice_type": "back_ground_sound_effect", "id": 2, "state": "end"},
34
+ '''
35
+ Example line 5:
36
+ '''
37
+ News Anchor, -15dB: Good evening, this is BBC News.
38
+ '''
39
+ convert to:
40
+ '''
41
+ {"voice_type": "speech", "character": "News Anchor", "volume": -15, "desc": "Good evening, this is BBC News."},
42
+ '''
43
+ Example line 6:
44
+ '''
45
+ [Sound effect, 3s, -15dB: Keyboard typing and mouse clicking]
46
+ '''
47
+ convert to:
48
+ '''
49
+ {"voice_type": "sound_effect", "length": 3, "volume": -15, "desc": "Keyboard typing and mouse clicking"},
50
+ '''
51
+ Example line 7:
52
+ '''
53
+ [Sound music, 10s, -15dB: Uplifting newsroom music]
54
+ '''
55
+ convert to:
56
+ '''
57
+ {"voice_type": "music", "length": 10, "volume": -15, "desc": "Uplifting newsroom music"},
58
+ '''
prompts/text_to_audio_script.prompt ADDED
@@ -0,0 +1,34 @@
+ I want you to act as an audio script writer. I'll give you an instruction which is a general idea and you will make it a short audio script.
+
+ The script should follow the rules below:
+ - For dialogue, each line must contain the character's name, its volume in decibels (human voices are usually around -15dB) and the line, example:
+ '''
+ Darth Vader, -16dB: Luke, I'm your father.
+ '''
+ - For a foreground sound effect, you must wrap the line with brackets and start with "Sound effect, ", and you should give the duration of the sound effect in seconds, and you should specify the volume you want in decibels (for foreground sound effects it's usually around -15dB), and you should give a very detailed description of the sound effect, example:
+ '''
+ [Sound effect, 2s, -15dB: Airport beeping sound]
+ '''
+ - For foreground music, you must wrap the line with brackets and start with "Music, ", and you should give the duration of the music in seconds, and you should specify the volume you want in decibels (for foreground music it's usually around -15dB), and you should give a very detailed description of the music, example:
+ '''
+ [Music, 10s, -15dB: 80's Rock and Roll music]
+ '''
+ - For background sound effects, you must wrap the line with brackets and start with "Background sound effect" followed by its id, and you must always explicitly indicate the start and end of the sound effect, and you should specify the volume you want in decibels (for background sound effects it's usually around -35dB), and you should give a very detailed description of the sound effect, example:
+ '''
+ [Background sound effect 1 begins, -34dB: Airport ambiance, including footsteps, luggage rolling, and distant airplane engine]
+ ...
+ [Background sound effect 1 ends]
+ '''
+ - For background music, you must wrap the line with brackets and start with "Background music" followed by its id, and you must always explicitly indicate the start and end of the music, and you should specify the volume you want in decibels (for background music it's usually around -35dB), and you should give a very detailed description of the music, example:
+ '''
+ [Background music 1 begins, -35dB: Uplifting newsroom music]
+ ...
+ [Background music 1 ends]
+ '''
+ - For music and sound effects, you cannot name the element anything outside of these:
+ ["Sound effect, ",
+ "Music, ",
+ "Background sound effect" followed by its id,
+ "Background music" followed by its id]
+ Names such as "Foreground sound effect" or "Foreground music" are forbidden.
prompts/text_to_json.prompt ADDED
@@ -0,0 +1,31 @@
+ I want you to act as an audio script writer. I'll give you input text which is a general idea and you will make it an audio script in json format. Instructions:
+ - Each line represents an audio. There are three types of audio: sound effect, music, and speech. For each audio, there are only two types of layouts: foreground and background. Foreground audios are played sequentially, and background audios are environmental sounds or music which are played while the foreground audios are being played.
+ - Sound effects can be either foreground or background. For sound effects, you must provide the layout, volume, length (in seconds), and a detailed description of the real-world sound effect. Example:
+ '''
+ {"audio_type": "sound_effect", "layout": "foreground", "vol": -35, "len": 2, "desc": "Airport beeping sound"},
+ '''
+ - The description of sound effects should not contain a specific person.
+ - Music can be either foreground or background. For music, you must provide the layout, volume, length (in seconds), and a detailed description of the music. Example:
+ '''
+ {"audio_type": "music", "layout": "foreground", "vol": -35, "len": 10, "desc": "Uplifting newsroom music"},
+ '''
+ - Speech can only be foreground. For speech, you must provide the character, volume, and the character's line. You do not need to specify the length of the speech. Example:
+ '''
+ {"audio_type": "speech", "layout": "foreground", "character": "News Anchor", "vol": -15, "text": "Good evening, this is BBC News. In today's breaking news, we have an unexpected turn of events in the political arena"},
+ '''
+ - The text of speech should not contain anything other than the lines, such as actions, expressions, emotions etc.
+ - For background audio, you must specify the beginning and the end of the background audio in separate lines to indicate when the audio begins and when it ends. Example for a background sound effect (background music is similar):
+ '''
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "begin", "vol": -35, "desc": "Airport ambiance, people walking"},
+ [foreground audio 1]
+ [foreground audio 2]
+ ...
+ {"audio_type": "sound_effect", "layout": "background", "id":1, "action": "end"},
+ '''
+ - Each background audio must have a unique id.
+ - You do not specify the length of a background audio.
+ - A background audio must be wrapped around at least one foreground audio.
+ - If a background sound effect has multiple sounds, please decompose it into multiple background sound effects.
+ - At any given time there must be at most one audio with type music playing, either foreground or background.
+ - The volume of background sound effects/music is usually around -35 ~ -40 dB.
+ - The output json must be a list as the root node containing all the audio nodes, and must be wrapped with triple quotes '''.
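
For reference, a minimal script that satisfies all of the constraints above might look like this (an illustrative example, not part of the prompt file itself):

'''
[
    {"audio_type": "sound_effect", "layout": "background", "id": 1, "action": "begin", "vol": -35, "desc": "Rain falling on a window"},
    {"audio_type": "speech", "layout": "foreground", "character": "Narrator", "vol": -15, "text": "It was a quiet, rainy evening in London."},
    {"audio_type": "music", "layout": "foreground", "vol": -35, "len": 5, "desc": "Soft closing piano melody"},
    {"audio_type": "sound_effect", "layout": "background", "id": 1, "action": "end"}
]
'''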
scripts/download_models.py ADDED
@@ -0,0 +1,31 @@
+ import yaml
+ import os
+
+ # Read the YAML file
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ # Extract values for each application
+ tts_env = config['Text-to-Speech']['env']
+
+ ttm_env = config['Text-to-Music']['env']
+ ttm_model_size = config['Text-to-Music']['model_size']
+
+ tta_env = config['Text-to-Audio']['env']
+
+ sr_env = config['Speech-Restoration']['env']
+
+ # Downloading the TTS models
+ print('Step 1: Downloading TTS model ...')
+ os.system(f'conda run --live-stream -n {tts_env} python -c \'from transformers import BarkModel; BarkModel.from_pretrained("suno/bark")\'')
+
+ print('Step 2: Downloading TTA model ...')
+ os.system(f'conda run --live-stream -n {tta_env} python -c \'from audiocraft.models import AudioGen; tta_model = AudioGen.get_pretrained("facebook/audiogen-medium")\'')
+
+ print('Step 3: Downloading TTM model ...')
+ os.system(f'conda run --live-stream -n {ttm_env} python -c \'from audiocraft.models import MusicGen; tta_model = MusicGen.get_pretrained("facebook/musicgen-{ttm_model_size}")\'')
+
+ print('Step 4: Downloading SR model ...')
+ os.system(f'conda run --live-stream -n {sr_env} python -c \'from voicefixer import VoiceFixer; vf = VoiceFixer()\'')
+
+ print('All models successfully downloaded!')
scripts/kill_services.py ADDED
@@ -0,0 +1,28 @@
+ import yaml
+ import os
+
+ # Read the YAML file
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ # Extract values for each application
+ tts_port = config['Text-to-Speech']['service-port']
+
+ ttm_port = config['Text-to-Music']['service-port']
+
+ tta_port = config['Text-to-Audio']['service-port']
+
+ sr_port = config['Speech-Restoration']['service-port']
+
+ vp_port = config['Voice-Parser']['service-port']
+
+
+ # Execute the commands
+ os.system(f'kill $(lsof -t -i :{tts_port})')
+ os.system(f'kill $(lsof -t -i :{tta_port})')
+ os.system(f'kill $(lsof -t -i :{ttm_port})')
+ os.system(f'kill $(lsof -t -i :{sr_port})')
+ os.system(f'kill $(lsof -t -i :{vp_port})')
scripts/restart_services.sh ADDED
@@ -0,0 +1,2 @@
+ python scripts/kill_services.py
+ python scripts/start_services.py
scripts/start_services.py ADDED
@@ -0,0 +1,41 @@
+ import yaml
+ import os
+
+ # Read the YAML file
+ with open('config.yaml', 'r') as file:
+     config = yaml.safe_load(file)
+
+ os.makedirs('services_logs', exist_ok=True)
+
+ # Extract values for each application
+ tts_model = config['Text-to-Speech']['model']
+ tts_env = config['Text-to-Speech']['env']
+
+ ttm_model = config['Text-to-Music']['model']
+ ttm_env = config['Text-to-Music']['env']
+
+ tta_model = config['Text-to-Audio']['model']
+ tta_env = config['Text-to-Audio']['env']
+
+ sr_model = config['Speech-Restoration']['model']
+ sr_env = config['Speech-Restoration']['env']
+ enable_sr = config['Speech-Restoration']['Enable']
+
+ vp_model = config['Voice-Parser']['model']
+ vp_env = config['Voice-Parser']['env']
+
+ # Execute the commands
+ os.system(f'nohup conda run --live-stream -n {tts_env} python {tts_model}/app.py > services_logs/meta_tts.out 2>&1 &')
+ os.system(f'nohup conda run --live-stream -n {vp_env} python {vp_model}/app.py > services_logs/meta_vp.out 2>&1 &')
+
+ if enable_sr:
+     os.system(f'nohup conda run --live-stream -n {sr_env} python {sr_model}/app.py > services_logs/meta_sr.out 2>&1 &')
+
+ # Using AudioCraft for TTA & TTM
+ if tta_env == ttm_env:
+     os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_tta_ttm.out 2>&1 &')
+
+ # Using AudioLDM for TTA, MusicGen for TTM
+ if tta_env != ttm_env:
+     os.system(f'nohup conda run --live-stream -n {tta_env} python {tta_model}/app.py > services_logs/meta_tta.out 2>&1 &')
+     os.system(f'nohup conda run --live-stream -n {ttm_env} python {ttm_model}/app.py > services_logs/meta_ttm.out 2>&1 &')
scripts/start_ui.sh ADDED
@@ -0,0 +1 @@
+ conda run --live-stream -n WavJourney gradio ui_client.py
ui_client.py ADDED
@@ -0,0 +1,273 @@
+ import pdb
+ import shutil
+
+ import gradio as gr
+
+ import pipeline
+ import utils
+ from pipeline import generate_json_file, generate_audio
+ from voice_presets import load_voice_presets_metadata, add_session_voice_preset, \
+     remove_session_voice_preset
+
+ import openai
+
+ VOICE_PRESETS_HEADERS = ['ID', 'Description']
+ DELETE_FILE_WHEN_DO_CLEAR = False
+ DEBUG = False
+
+
+ def generate_script_fn(instruction, _state: gr.State):
+     try:
+         session_id = _state['session_id']
+         json_script = generate_json_file(session_id, instruction)
+         table_text = pipeline.convert_json_to_md(json_script)
+     except Exception as e:
+         gr.Warning(str(e))
+         print(f"Generating script error: {str(e)}")
+         # Return one value per output component (markdown, state, and the four buttons)
+         return [None, _state, gr.Button.update(interactive=False), gr.Button.update(interactive=True),
+                 gr.Button.update(interactive=True), gr.Button.update(interactive=True)]
+     _state = {
+         **_state,
+         'session_id': session_id,
+         'json_script': json_script
+     }
+     return [
+         table_text,
+         _state,
+         gr.Button.update(interactive=True),
+         gr.Button.update(interactive=True),
+         gr.Button.update(interactive=True),
+         gr.Button.update(interactive=True),
+     ]
+
+
+ def generate_audio_fn(state):
+     btn_state = gr.Button.update(interactive=True)
+     try:
+         audio_path = generate_audio(**state)
+         return [
+             gr.make_waveform(str(audio_path)),
+             btn_state,
+             btn_state,
+             btn_state,
+             btn_state,
+         ]
+     except Exception as e:
+         print(f"Generating audio error: {str(e)}")
+         gr.Warning(str(e))
+         return [
+             None,
+             btn_state,
+             btn_state,
+             btn_state,
+             btn_state,
+         ]
+
+
+ def clear_fn(state):
+     if DELETE_FILE_WHEN_DO_CLEAR:
+         shutil.rmtree('output', ignore_errors=True)
+     state = {'session_id': pipeline.init_session()}
+     return [gr.Textbox.update(value=''), gr.Video.update(value=None),
+             gr.Markdown.update(value=''), gr.Button.update(interactive=False), gr.Button.update(interactive=False),
+             state, gr.Dataframe.update(visible=False), gr.Button.update(visible=False),
+             gr.Textbox.update(value=''), gr.Textbox.update(value=''), gr.File.update(value=None)]
+
+
+ def textbox_listener(textbox_input):
+     if len(textbox_input) > 0:
+         return gr.Button.update(interactive=True)
+     else:
+         return gr.Button.update(interactive=False)
+
+
+ def get_voice_preset_to_list(state: gr.State):
+     if state.__class__ == dict:
+         session_id = state['session_id']
+     else:
+         session_id = state.value['session_id']
+     voice_presets = load_voice_presets_metadata(
+         utils.get_session_voice_preset_path(session_id),
+         safe_if_metadata_not_exist=True
+     )
+     dataframe = []
+     for key in voice_presets.keys():
+         row = [key, voice_presets[key]['desc']]
+         dataframe.append(row)
+     return dataframe
+
+
+ def df_on_select(evt: gr.SelectData):
+     print(f"You selected {evt.value} at {evt.index} from {evt.target}")
+     return {'selected_voice_preset': evt.index}
+
+
+ def del_voice_preset(selected_voice_presets, ui_state, dataframe):
+     gr_visible = gr.Dataframe.update(visible=True)
+     btn_visible = gr.Button.update(visible=True)
+     current_presets = get_voice_preset_to_list(ui_state)
+     if selected_voice_presets['selected_voice_preset'] is None or \
+             selected_voice_presets['selected_voice_preset'][0] > len(current_presets) - 1:
+         gr.Warning('No row is selected')
+         return [current_presets, gr_visible, btn_visible, selected_voice_presets]
+     # Do the real file deletion
+     index = selected_voice_presets['selected_voice_preset'][0]
+     vp_id = dataframe['ID'][index]
+     remove_session_voice_preset(vp_id, ui_state['session_id'])
+     current_presets = get_voice_preset_to_list(ui_state)
+     gr.Dataframe.update(value=current_presets)
+     if len(current_presets) == 0:
+         gr_visible = gr.Dataframe.update(visible=False)
+         btn_visible = gr.Button.update(visible=False)
+         selected_voice_presets['selected_voice_preset'] = None
+     return [current_presets, gr_visible, btn_visible, selected_voice_presets]
+
+
+ def get_system_voice_presets():
+     system_presets = load_voice_presets_metadata(utils.get_system_voice_preset_path())
+     data = []
+     for k, v in system_presets.items():
+         data.append([k, v['desc']])
+     # headers = ['id', 'description']
+     # table_txt = tabulate(data, headers, tablefmt="github")
+     return data
+
+
+ def set_openai_key(key):
+     openai.api_key = key
+     return key
+
+
+ def add_voice_preset(vp_id, vp_desc, file, ui_state, added_voice_preset):
+     if vp_id is None or vp_desc is None or file is None or vp_id.strip() == '' or vp_desc.strip() == '':
+         gr.Warning('Please complete all three fields')
+     else:
+         count: int = added_voice_preset['count']
+         # check if greater than 3
+         session_id = ui_state['session_id']
+         file_path = file.name
+         print(f'session {session_id}, id {vp_id}, desc {vp_desc}, file {file_path}')
+         # Do adding ...
+         try:
+             add_session_voice_preset(vp_id, vp_desc, file_path, session_id)
+             added_voice_preset['count'] = count + 1
+         except Exception as exception:
+             gr.Warning(str(exception))
+     # After added
+     dataframe = get_voice_preset_to_list(ui_state)
+     df_visible = gr.Dataframe.update(visible=True)
+     del_visible = gr.Button.update(visible=True)
+     if len(dataframe) == 0:
+         df_visible = gr.Dataframe.update(visible=False)
+         del_visible = gr.Button.update(visible=False)
+     return [gr.Textbox.update(value=''), gr.Textbox.update(value=''), gr.File.update(value=None),
+             ui_state, added_voice_preset, dataframe, gr.Button.update(interactive=True),
+             df_visible, del_visible]
+
+
+ with gr.Blocks() as interface:
+     system_voice_presets = get_system_voice_presets()
+     # State
+     ui_state = gr.State(value={'session_id': pipeline.init_session()})
+     selected_voice_presets = gr.State(value={'selected_voice_preset': None})
+     added_voice_preset_state = gr.State(value={'added_file': None, 'count': 0})
+     # UI Components
+     key_text_input = gr.Textbox(label='Please enter your OpenAI key for accessing GPT-4', lines=1, placeholder="Input OpenAI key here.",
+                                 value='')
+     text_input_value = '' if DEBUG is False else "News channel BBC broadcast about Trump playing street fighter 6 against Biden"
+     text_input = gr.Textbox(label='Input', lines=2, placeholder="Input instruction here.",
+                             value=text_input_value)
+     markdown_output = gr.Markdown(label='Audio Script', lines=2)
+     generate_script_btn = gr.Button(value='Generate Script', interactive=False)
+     audio_output = gr.Video(type='filepath')
+     generate_audio_btn = gr.Button(value='Generate Audio', interactive=False)
+     clear_btn = gr.ClearButton(value='Clear Inputs')
+     # System Voice Presets
+     gr.Markdown(label='System Voice Presets', value='# System Voice Presets')
+     system_markdown_voice_presets = gr.Dataframe(label='System Voice Presets', headers=VOICE_PRESETS_HEADERS,
+                                                  value=system_voice_presets)
+     # User Voice Presets
+     gr.Markdown(label='User Voice Presets', value='# User Voice Presets')
+     get_voice_preset_to_list(ui_state)
+     voice_presets_df = gr.Dataframe(headers=VOICE_PRESETS_HEADERS, col_count=len(VOICE_PRESETS_HEADERS),
+                                     value=get_voice_preset_to_list(ui_state), interactive=False, visible=False)
+     # voice_presets_ds = gr.Dataset(components=[gr.Dataframe(visible=True)], samples=get_voice_preset_to_list(ui_state))
+     del_voice_btn = gr.Button(value='Delete Selected Voice Preset', visible=False)
+     gr.Markdown(label='Add Voice Preset', value='## Add Voice Preset')
+     vp_text_id = gr.Textbox(label='Id', lines=1, placeholder="Input voice preset id here.")
+     vp_text_desc = gr.Textbox(label='Desc', lines=1, placeholder="Input description here.")
+     vp_file = gr.File(label='Wav File', type='file', description='Upload your wav file here.', file_types=['.wav'],
+                       interactive=True)
+     vp_submit = gr.Button(label='Upload Voice Preset', value="Upload Voice Preset")
+     # events
+     key_text_input.change(fn=set_openai_key, inputs=[key_text_input], outputs=[key_text_input])
+     text_input.change(fn=textbox_listener, inputs=[text_input], outputs=[generate_script_btn])
+     generate_audio_btn.click(
+         fn=generate_audio_fn,
+         inputs=[ui_state],
+         outputs=[
+             audio_output,
+             generate_audio_btn,
+             generate_script_btn,
+             clear_btn,
+             vp_submit,
+         ],
+         api_name='audio_journey',
+     )
+     generate_audio_btn.click(
+         fn=lambda _: [
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+         ],
+         outputs=[
+             generate_audio_btn,
+             generate_script_btn,
+             clear_btn,
+             vp_submit,
+         ]
+     )
+     clear_btn.click(fn=clear_fn, inputs=ui_state,
+                     outputs=[text_input, audio_output, markdown_output, generate_audio_btn, generate_script_btn,
+                              ui_state, voice_presets_df, del_voice_btn,
+                              vp_text_id, vp_text_desc, vp_file])
+     generate_script_btn.click(
+         fn=generate_script_fn, inputs=[text_input, ui_state],
+         outputs=[
+             markdown_output,
+             ui_state,
+             generate_audio_btn,
+             generate_script_btn,
+             clear_btn,
+             vp_submit,
+         ]
+     )
+     generate_script_btn.click(
+         fn=lambda _: [
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+             gr.Button.update(interactive=False),
+         ],
+         outputs=[
+             generate_audio_btn,
+             generate_script_btn,
+             clear_btn,
+             vp_submit,
+         ]
+     )
+     voice_presets_df.select(df_on_select, outputs=[selected_voice_presets])
+     voice_presets_df.update(lambda x: print(x))
+     del_voice_btn.click(del_voice_preset, inputs=[selected_voice_presets, ui_state, voice_presets_df],
+                         outputs=[voice_presets_df, voice_presets_df, del_voice_btn, selected_voice_presets])
+     # user voice preset upload
+     vp_submit.click(add_voice_preset, inputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state],
+                     outputs=[vp_text_id, vp_text_desc, vp_file, ui_state, added_voice_preset_state, voice_presets_df,
+                              vp_submit,
+                              voice_presets_df, del_voice_btn])
+     vp_submit.click(lambda _: gr.Button.update(interactive=False), inputs=[vp_submit])
+     # debug only
+     # print_state_btn = gr.Button(value='Print State')
+     # print_state_btn.click(fn=lambda state, state2: print(state, state2), inputs=[ui_state, selected_voice_presets])
+ interface.queue(concurrency_count=5)
+ interface.launch()