{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a7c1e0d2",
   "metadata": {},
   "source": [
    "# Text-to-speech inference demo\n",
    "\n",
    "Loads the generator checkpoint from `./OUTPUT_MODEL`, synthesizes one sentence for a single speaker, and plays the audio inline.\n",
    "\n",
    "Run the cells top to bottom on a fresh kernel."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5dde1b9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import utils\n",
    "from models import SynthesizerTrn\n",
    "import torch\n",
    "from torch import no_grad, LongTensor\n",
    "import librosa\n",
    "from text import text_to_sequence, _clean_text\n",
    "import commons\n",
    "import scipy.io.wavfile as wavf\n",
    "import os\n",
    "\n",
    "import IPython.display as ipd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4bc040a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune lives in this cell.\n",
    "\n",
    "# Generator checkpoint and the hyper-parameter file read by the cells below.\n",
    "model_path = \"./OUTPUT_MODEL/G_latest.pth\"\n",
    "config_path = \"./OUTPUT_MODEL/config.json\"\n",
    "\n",
    "# The model is called with length_scale = 1.0 / length, so this must be positive.\n",
    "length = 1.0\n",
    "\n",
    "# Noise settings forwarded to `net_g.infer`.\n",
    "NOISE_SCALE = 0.667\n",
    "NOISE_SCALE_W = 0.6\n",
    "\n",
    "# Use the first CUDA device when one is available, otherwise the CPU.\n",
    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c3b7d4f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_text(text, hps, is_symbol):\n",
    "    \"\"\"Encode `text` as a LongTensor of symbol ids.\n",
    "\n",
    "    Args:\n",
    "        text: Input string.\n",
    "        hps: Hyper-parameters; reads `hps.symbols`, `hps.data.text_cleaners`\n",
    "            and `hps.data.add_blank`.\n",
    "        is_symbol: When true, an empty cleaner list is passed to\n",
    "            `text_to_sequence` instead of `hps.data.text_cleaners`.\n",
    "\n",
    "    Returns:\n",
    "        LongTensor built from the encoded sequence.\n",
    "    \"\"\"\n",
    "    cleaners = [] if is_symbol else hps.data.text_cleaners\n",
    "    text_norm = text_to_sequence(text, hps.symbols, cleaners)\n",
    "    if hps.data.add_blank:\n",
    "        # Filler value 0 is presumably the blank symbol id - TODO confirm.\n",
    "        text_norm = commons.intersperse(text_norm, 0)\n",
    "    return LongTensor(text_norm)\n",
    "\n",
    "\n",
    "def synthesize(text, net_g, hps, speaker_id=0, length=1.0,\n",
    "               noise_scale=NOISE_SCALE, noise_scale_w=NOISE_SCALE_W,\n",
    "               is_symbol=False):\n",
    "    \"\"\"Synthesize `text` with `net_g` and return the waveform.\n",
    "\n",
    "    Input tensors are moved to the module-level `device`.\n",
    "\n",
    "    Args:\n",
    "        text: Sentence to speak.\n",
    "        net_g: Model whose `infer` method is called.\n",
    "        hps: Hyper-parameters forwarded to `get_text`.\n",
    "        speaker_id: Integer wrapped in a LongTensor and passed as `sid`.\n",
    "        length: Positive number; the model gets `length_scale=1.0 / length`.\n",
    "        noise_scale: Forwarded to `net_g.infer`.\n",
    "        noise_scale_w: Forwarded to `net_g.infer`.\n",
    "        is_symbol: Forwarded to `get_text`.\n",
    "\n",
    "    Returns:\n",
    "        float32 NumPy array taken from `net_g.infer(...)[0][0, 0]`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: If `length` is not positive.\n",
    "    \"\"\"\n",
    "    if length <= 0:\n",
    "        raise ValueError(f\"length must be positive, got {length!r}\")\n",
    "\n",
    "    stn_tst = get_text(text, hps, is_symbol)\n",
    "    with no_grad():\n",
    "        # Add a leading batch dimension of size 1.\n",
    "        x_tst = stn_tst.unsqueeze(0).to(device)\n",
    "        x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)\n",
    "        sid = LongTensor([speaker_id]).to(device)\n",
    "        output = net_g.infer(\n",
    "            x_tst,\n",
    "            x_tst_lengths,\n",
    "            sid=sid,\n",
    "            noise_scale=noise_scale,\n",
    "            noise_scale_w=noise_scale_w,\n",
    "            length_scale=1.0 / length,\n",
    "        )\n",
    "        # Intermediate tensors are locals, so they are released on return.\n",
    "        return output[0][0, 0].data.cpu().float().numpy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e2f6a85",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the model from the hyper-parameter file and load the checkpoint.\n",
    "hps = utils.get_hparams_from_file(config_path)\n",
    "net_g = SynthesizerTrn(\n",
    "    len(hps.symbols),\n",
    "    hps.data.filter_length // 2 + 1,\n",
    "    hps.train.segment_size // hps.data.hop_length,\n",
    "    n_speakers=hps.data.n_speakers,\n",
    "    **hps.model,\n",
    ").to(device)\n",
    "net_g.eval()\n",
    "# Third argument is None; presumably the optimizer slot - TODO confirm.\n",
    "utils.load_checkpoint(model_path, net_g, None)\n",
    "\n",
    "# Speaker table from the config, kept for interactive lookup.\n",
    "speaker_ids = hps.speakers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "032cc92d",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = \"おはようございます。\"\n",
    "# If the configured cleaners expect language tags (TODO confirm), wrap the text:\n",
    "# text = \"[JA]\" + text + \"[JA]\"\n",
    "speaker_id = 0\n",
    "\n",
    "audio = synthesize(text, net_g, hps, speaker_id=speaker_id, length=length)\n",
    "\n",
    "# normalize=False plays the samples unscaled; IPython raises ValueError\n",
    "# if any sample lies outside [-1, 1].\n",
    "ipd.display(ipd.Audio(audio, rate=hps.data.sampling_rate, normalize=False))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}