In [None]:
!git clone https://github.com/neonbjb/tortoise-tts.git
%cd tortoise-tts
!pip install -r requirements.txt

In [None]:
# Imports used through the rest of the notebook.
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm

from utils.tokenizer import VoiceBpeTokenizer
from models.discrete_diffusion_vocoder import DiscreteDiffusionVocoder
from models.text_voice_clip import VoiceCLIP
from models.dvae import DiscreteVAE
from models.autoregressive import UnifiedVoice

# These have some fairly interesting code that is hidden in the colab. Consider checking it out.
from do_tts import download_models, load_discrete_vocoder_diffuser, load_conditioning, fix_autoregressive_output, do_spectrogram_diffusion

In [None]:
# Download pretrained models and set up the pretrained voice bank.
# Every voice maps to two conditioning clips: voices/<name>/1.wav and voices/<name>/2.wav.
# Feel free to add your own voices: upload two WAV files cropped to 5-10 seconds of someone
# speaking and add an entry mapping the voice name to the two file paths.
download_models()
male_voices = ('dotrice', 'harris', 'lescault', 'otto')
female_voices = ('atkins', 'grace', 'kennard', 'mol')
preselected_cond_voices = {
    name: [f'voices/{name}/1.wav', f'voices/{name}/2.wav']
    for name in male_voices + female_voices
}

In [None]:
# --- Settings you may want to change ---

# The voice that will speak the text.
voice = 'atkins'

# The text that will be spoken.
text = "And took the other as just as fair, and having perhaps the better claim, because it was grassy and wanted wear."

# Number of samples to generate from the DALLE-style model. More samples produce better
# results but take longer to produce; going below 128 is not recommended.
num_autoregressive_samples = 128

In [None]:
# Prepare data.
# NOTE: this cell replaces the `text` string with its tensor encoding, so the cell that
# defines `text` must be run again before this cell can be re-run.
tokenizer = VoiceBpeTokenizer()
token_ids = tokenizer.encode(text)
text = torch.IntTensor(token_ids).unsqueeze(0).cuda()  # unsqueeze(0) adds a leading dimension.
text = F.pad(text, (0, 1))  # One extra trailing element on the last dimension; this may not be necessary.

# Load every conditioning clip registered for the chosen voice.
cond_paths = preselected_cond_voices[voice]
loaded_conds = [load_conditioning(cond_path) for cond_path in cond_paths]
conds = torch.stack([c for c, _ in loaded_conds], dim=1)
# And just use the last cond_wav for the diffusion model.
cond_wav = loaded_conds[-1][1]

In [None]:
# Load the autoregressive model.
autoregressive = UnifiedVoice(
    max_mel_tokens=300,
    max_text_tokens=200,
    max_conditioning_inputs=2,
    layers=30,
    model_dim=1024,
    heads=16,
    number_text_tokens=256,
    start_text_token=255,
    checkpointing=False,
    train_solo_embeddings=False,
)
# Move to the GPU and switch to eval mode before loading the pretrained weights.
autoregressive = autoregressive.cuda().eval()
# NOTE(review): torch.load relies on pickle; only load checkpoints from a source you trust.
autoregressive.load_state_dict(torch.load('.models/autoregressive.pth'))
# Kept at notebook level because later cells still need it after the model is deleted.
stop_mel_token = autoregressive.stop_mel_token

In [None]:
# Perform inference with the autoregressive model, generating num_autoregressive_samples
# candidate code sequences in batches.
autoregressive_batch_size = 16  # Sequences requested from each inference_speech call.
padded_code_length = 250  # Common length for every batch so the batches can be concatenated later.

if num_autoregressive_samples < 1:
    raise ValueError('num_autoregressive_samples must be at least 1')

# Full batches first, then one smaller batch for any remainder, so exactly
# num_autoregressive_samples candidates are produced (a plain floor division would
# silently drop the remainder and produce nothing at all for values below the batch size).
full_batches, remainder = divmod(num_autoregressive_samples, autoregressive_batch_size)
batch_sizes = [autoregressive_batch_size] * full_batches + ([remainder] if remainder else [])

with torch.no_grad():
    samples = []
    for batch_size in tqdm(batch_sizes):
        codes = autoregressive.inference_speech(conds, text, num_beams=1, repetition_penalty=1.0, do_sample=True, top_k=50, top_p=.95,
                                                temperature=.9, num_return_sequences=batch_size, length_penalty=1)
        # Fill the tail of every sequence with the stop token up to the common length.
        padding_needed = padded_code_length - codes.shape[1]
        codes = F.pad(codes, (0, padding_needed), value=stop_mel_token)
        samples.append(codes)

# Delete model weights to conserve memory.
del autoregressive

In [None]:
# Load the CLIP model.
clip = VoiceCLIP(
    dim_text=512,
    dim_speech=512,
    dim_latent=512,
    num_text_tokens=256,
    text_enc_depth=8,
    text_seq_len=120,
    text_heads=8,
    num_speech_tokens=8192,
    speech_enc_depth=10,
    speech_heads=8,
    speech_seq_len=250,
)
# Move to the GPU and switch to eval mode before loading the pretrained weights.
clip = clip.cuda().eval()
# NOTE(review): torch.load relies on pickle; only load checkpoints from a source you trust.
clip.load_state_dict(torch.load('.models/clip.pth'))

In [None]:
# Use the CLIP model to select the best autoregressive output to match the given text.
with torch.no_grad():
    # Ugly hack to fix the fact that CLIP was not trained to handle long enough text:
    # only the first 120 text tokens are scored. The slice is the same for every batch,
    # so it is taken once up front.
    text = text[:, :120]

    clip_results = []
    for batch in samples:
        batch_size = batch.shape[0]
        # Clean up each generated sequence in place before it is scored.
        for row in range(batch_size):
            batch[row] = fix_autoregressive_output(batch[row], stop_mel_token)
        text_lengths = torch.full((batch_size,), fill_value=text.shape[1]-1, dtype=torch.long, device='cuda')
        speech_lengths = torch.full((batch_size,), fill_value=batch.shape[1]*1024, dtype=torch.long, device='cuda')
        scores = clip(text.repeat(batch_size, 1), text_lengths, batch, speech_lengths, return_loss=False)
        clip_results.append(scores)

    # Flatten the per-batch lists and keep the single top-scoring candidate.
    clip_results = torch.cat(clip_results, dim=0)
    samples = torch.cat(samples, dim=0)
    best_results = samples[torch.topk(clip_results, k=1).indices]

# Save samples to CPU memory, delete clip to conserve memory.
samples = samples.cpu()
del clip

In [None]:
# Load the DVAE and diffusion model.
# NOTE(review): torch.load relies on pickle; only load checkpoints from a source you trust.
dvae = DiscreteVAE(
    positional_dims=1,
    channels=80,
    hidden_dim=512,
    num_resnet_blocks=3,
    codebook_dim=512,
    num_tokens=8192,
    num_layers=2,
    record_codes=True,
    kernel_size=3,
    use_transposed_convs=False,
)
dvae = dvae.cuda().eval()
# strict=False: the checkpoint keys are not required to match the module exactly.
dvae.load_state_dict(torch.load('.models/dvae.pth'), strict=False)

diffusion = DiscreteDiffusionVocoder(
    model_channels=128,
    dvae_dim=80,
    channel_mult=[1, 1, 1.5, 2, 3, 4, 6, 8, 8, 8, 8],
    num_res_blocks=[1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1],
    spectrogram_conditioning_resolutions=[2,512],
    attention_resolutions=[512,1024],
    num_heads=4,
    kernel_size=3,
    scale_factor=2,
    conditioning_inputs_provided=True,
    time_embed_dim_multiplier=4,
)
diffusion = diffusion.cuda().eval()
diffusion.load_state_dict(torch.load('.models/diffusion.pth'))

diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=100)

In [None]:
# Decode the (best) discrete sequence created by the autoregressive model and write each
# result to '<voice>_<index>.wav' at 22050 Hz.
with torch.no_grad():
    for b, best_code in enumerate(best_results):
        code = best_code.unsqueeze(0)  # Add a leading dimension before decoding.
        wav = do_spectrogram_diffusion(diffusion, dvae, diffuser, code, cond_wav, spectrogram_compression_factor=256, mean=True)
        waveform_cpu = wav.squeeze(0).cpu()
        torchaudio.save(f'{voice}_{b}.wav', waveform_cpu, 22050)

In [None]:
# Listen to your text! (told you that'd take a long time..)
# Plays the most recently decoded waveform left in `wav` by the decoding cell.
from IPython.display import Audio
audio_data = wav.squeeze(0).cpu().numpy()
Audio(data=audio_data, rate=22050)