File size: 2,382 Bytes
c968fc3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
# Copyright (c) 2024 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import os
import torch
from tqdm import tqdm
from collections import OrderedDict
from models.tts.base.tts_inferece import TTSInference
from models.tts.jets.jets_dataset import JetsTestDataset, JetsTestCollator
from utils.util import load_config
from utils.io import save_audio
from models.tts.jets.jets import Jets
from models.vocoders.vocoder_inference import synthesis
from pathlib import Path
from processors.phone_extractor import phoneExtractor
from text.text_token_collation import phoneIDCollation
import numpy as np
import json
import time
class JetsInference(TTSInference):
def __init__(self, args, cfg):
TTSInference.__init__(self, args, cfg)
self.args = args
self.cfg = cfg
self.infer_type = args.mode
def _build_model(self):
self.model = Jets(self.cfg)
return self.model
def _build_test_dataset(self):
return JetsTestDataset, JetsTestCollator
def inference_for_batches(self):
###### Construct test_batch ######
n_batch = len(self.test_dataloader)
now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
print(
"Model eval time: {}, batch_size = {}, n_batch = {}".format(
now, self.test_batch_size, n_batch
)
)
self.model.eval()
###### Inference for each batch ######
pred_res = []
with torch.no_grad():
for i, batch_data in enumerate(
self.test_dataloader if n_batch == 1 else tqdm(self.test_dataloader)
):
outputs = self.model.inference(batch_data)
audios, d_predictions = outputs
d_predictions = d_predictions.unsqueeze(-1)
for idx in range(audios.size(0)):
audio = audios[idx, 0, :].data.cpu().float()
duration = d_predictions[idx, :, :]
audio_length = (
duration.sum([0, 1]).long() * self.cfg.preprocess.hop_size
)
audio_length = audio_length.cpu().numpy()
audio = audio[:audio_length]
pred_res.append(audio)
return pred_res
|