max input length
Browse files- infer_onnx.py +38 -32
infer_onnx.py
CHANGED
@@ -22,7 +22,8 @@ def intersperse(lst, item):
|
|
22 |
result = [item] * (len(lst) * 2 + 1)
|
23 |
result[1::2] = lst
|
24 |
return result
|
25 |
-
|
|
|
26 |
def process_text(i: int, text: str, device: torch.device, cleaner:str):
|
27 |
print(f"[{i}] - Input text: {text}")
|
28 |
x = torch.tensor(
|
@@ -152,36 +153,40 @@ def vocos_inference(mel,denoise):
|
|
152 |
|
153 |
|
154 |
def tts(text:str, accent:str, spk_name:str, temperature:float, length_scale:float):
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
176 |
-
|
177 |
-
|
178 |
-
|
179 |
-
|
180 |
-
|
181 |
-
|
182 |
-
|
183 |
-
|
184 |
-
|
|
|
|
|
|
|
|
|
185 |
|
186 |
|
187 |
## GUI space
|
@@ -244,7 +249,8 @@ matcha_inference = gr.Interface(
|
|
244 |
gr.Textbox(
|
245 |
value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
|
246 |
max_lines=1,
|
247 |
-
label="Input text",
|
|
|
248 |
),
|
249 |
accent_dropdown,
|
250 |
speaker_dropdown,
|
|
|
22 |
result = [item] * (len(lst) * 2 + 1)
|
23 |
result[1::2] = lst
|
24 |
return result
|
25 |
+
|
26 |
+
|
27 |
def process_text(i: int, text: str, device: torch.device, cleaner:str):
|
28 |
print(f"[{i}] - Input text: {text}")
|
29 |
x = torch.tensor(
|
|
|
153 |
|
154 |
|
155 |
def tts(text:str, accent:str, spk_name:str, temperature:float, length_scale:float):
    """Synthesize ``text`` with the Matcha + Vocos ONNX pipeline.

    Args:
        text: Input text. Anything longer than 500 characters is rejected
            with a Gradio info notice and nothing is synthesized.
        accent: Key used to select the speaker table, the text cleaner and
            the Matcha model (``speaker_id_dict``, ``cleaners``, ``models``).
        spk_name: Speaker name looked up inside the table of ``accent``.
        temperature: First entry of the ``scales`` array fed to the model.
        length_scale: Second entry of the ``scales`` array fed to the model.

    Returns:
        The path of the temporary ``.wav`` file that was written, or
        ``None`` when the input exceeds the length limit.
    """
    # Guard clause: over-long input only raises a UI notice, no audio is produced.
    if len(text) > 500:
        gr.Info("The maximum input allowed is 500 characters.")
        return None

    denoise = True
    speaker = speaker_id_dict[accent][spk_name]
    sid = None if speaker is None else np.array([int(speaker)])
    tokens, token_lengths = process_text(0, text, "cpu", cleaner=cleaners[accent])
    acoustic_model = models[accent]

    # MATCHA VOCOS
    feed = {
        "x": tokens,
        "x_lengths": token_lengths,
        "scales": np.array([temperature, length_scale], dtype=np.float32),
        "spks": sid,
    }

    # Matcha mel inference, timed on its own.
    mel_start = perf_counter()
    mel, _ = acoustic_model.run(None, feed)
    mel_infer_secs = perf_counter() - mel_start
    print("Matcha Mel inference time", mel_infer_secs)

    # Vocos inference, timed on its own.
    vocos_start = perf_counter()
    wavs_vocos = vocos_inference(mel, denoise)
    vocos_infer_secs = perf_counter() - vocos_start
    print("Vocos inference time", vocos_infer_secs)

    # delete=False keeps the file on disk after the handle closes, so the
    # returned path is still usable by the caller.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False, dir="/home/user/app") as wav_file:
        sf.write(wav_file.name, wavs_vocos.squeeze(0), 22050, "PCM_24")

    # Real-time factor: total inference time over audio duration
    # (presumably 22050 is the sample rate in Hz -- TODO confirm).
    audio_secs = wavs_vocos.shape[1] / 22050
    rtf = (mel_infer_secs + vocos_infer_secs) / audio_secs
    print(f"RTF matcha + vocos {rtf}")
    return wav_file.name
|
190 |
|
191 |
|
192 |
## GUI space
|
|
|
249 |
gr.Textbox(
|
250 |
value="m'ha costat molt desenvolupar una veu, i ara que la tinc no estaré en silenci.",
|
251 |
max_lines=1,
|
252 |
+
label="Input text ",
|
253 |
+
info="max 500 characters",
|
254 |
),
|
255 |
accent_dropdown,
|
256 |
speaker_dropdown,
|