import torch import torch.nn as nn import torch.nn.functional as F import numpy as np from utils import ConvNorm, LinearNorm from torch.nn.parameter import Parameter class GroupNorm_Mask(nn.Module): def __init__(self, num_groups, num_channels, eps=1e-5, affine=True): super().__init__() self.num_groups = num_groups self.num_channels = num_channels self.eps = eps self.affine = affine if self.affine: self.weight = Parameter(torch.Tensor(num_channels)) self.bias = Parameter(torch.Tensor(num_channels)) else: self.register_parameter('weight', None) self.register_parameter('bias', None) self.reset_parameters() def reset_parameters(self): if self.affine: nn.init.ones_(self.weight) nn.init.zeros_(self.bias) def forward(self, x, mask): B, C, L = x.size() assert C % self.num_groups == 0 x = x.view(B, self.num_groups, C//self.num_groups, L) mask = mask.view(B, 1, 1, L) x = x * mask mean = x.mean(dim=2, keepdim=True).sum(dim=3, keepdim=True) / mask.sum(dim=3, keepdim=True) var = (((x - mean)**2)*mask).mean(dim=2, keepdim=True).sum(dim=3, keepdim=True) / mask.sum(dim=3, keepdim=True) x = (x - mean) / (var + self.eps).sqrt() x = x.view(B, C, L) return x * self.weight.view(1,-1,1) + self.bias.view(1,-1,1) class M43_Sequential(nn.Sequential): def forward(self, inputs, mask): inputs = self._modules['0'](inputs) inputs = self._modules['1'](inputs, mask) return inputs class Encoder(nn.Module): """Encoder module: """ def __init__(self, hparams): super(Encoder, self).__init__() self.dim_freq = hparams.dim_freq_sea self.dim_enc = hparams.dim_enc_sea self.chs_grp = hparams.chs_grp self.dim_neck = hparams.dim_neck_sea convolutions = [] for i in range(5): conv_layer = M43_Sequential( ConvNorm(self.dim_freq if i==0 else self.dim_enc, self.dim_enc, kernel_size=1, stride=1, padding=0, dilation=1, w_init_gain='relu'), GroupNorm_Mask(self.dim_enc//self.chs_grp, self.dim_enc)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(self.dim_enc, 128, kernel_size=1, stride=1, padding=0, dilation=1, w_init_gain='relu'), GroupNorm_Mask(128//self.chs_grp, 128)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(128, 32, kernel_size=1, stride=1, padding=0, dilation=1, w_init_gain='relu'), GroupNorm_Mask(32//self.chs_grp, 32)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(32, self.dim_neck, kernel_size=1, stride=1, padding=0, dilation=1, w_init_gain='relu'), GroupNorm_Mask(1, self.dim_neck)) convolutions.append(conv_layer) self.convolutions = nn.ModuleList(convolutions) def forward(self, x, mask): for conv in self.convolutions: x = F.relu(conv(x, mask)) codes = x.permute(0, 2, 1) * mask.unsqueeze(-1) return codes class Decoder(nn.Module): """Decoder module: """ def __init__(self, hparams): super(Decoder, self).__init__() self.dim_enc = hparams.dim_enc_sea self.dim_emb = hparams.dim_spk self.dim_freq = hparams.dim_freq_sp self.dim_neck = hparams.dim_neck_sea self.lstm = nn.LSTM(self.dim_neck+self.dim_emb, 1024, 3, batch_first=True) self.linear_projection = LinearNorm(1024, self.dim_freq) def forward(self, x): outputs = self.lstm(x)[0] decoder_output = self.linear_projection(outputs) return decoder_output class Generator(nn.Module): """Generator network.""" def __init__(self, hparams): super(Generator, self).__init__() self.encoder = Encoder(hparams) self.decoder = Decoder(hparams) def forward(self, x, c_trg): x = x.transpose(2,1) codes = self.encoder(x) encoder_outputs = torch.cat((codes, c_trg.unsqueeze(1).expand(-1,x.size(-1),-1)), dim=-1) mel_outputs = self.decoder(encoder_outputs) return mel_outputs def encode(self, x, mask): x = x.transpose(2,1) codes = self.encoder(x, mask) return codes def decode(self, codes, c_trg): encoder_outputs = torch.cat((codes, c_trg.unsqueeze(1).expand(-1,codes.size(1),-1)), dim=-1) mel_outputs = self.decoder(encoder_outputs) return mel_outputs class Encoder_2(nn.Module): """Encoder module: """ def __init__(self, hparams): super().__init__() self.dim_freq = hparams.dim_freq_sea self.dim_enc = hparams.dim_enc_sea self.chs_grp = hparams.chs_grp self.dim_neck = hparams.dim_neck_sea convolutions = [] for i in range(5): conv_layer = M43_Sequential( ConvNorm(self.dim_freq if i==0 else self.dim_enc, self.dim_enc, kernel_size=5, stride=1, padding=2, dilation=1, w_init_gain='relu'), GroupNorm_Mask(self.dim_enc//self.chs_grp, self.dim_enc)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(self.dim_enc, 128, kernel_size=5, stride=1, padding=2, dilation=1, w_init_gain='relu'), GroupNorm_Mask(128//self.chs_grp, 128)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(128, 32, kernel_size=5, stride=1, padding=2, dilation=1, w_init_gain='relu'), GroupNorm_Mask(32//self.chs_grp, 32)) convolutions.append(conv_layer) conv_layer = M43_Sequential( ConvNorm(32, self.dim_neck, kernel_size=5, stride=1, padding=2, dilation=1, w_init_gain='linear'), GroupNorm_Mask(1, self.dim_neck)) convolutions.append(conv_layer) self.convolutions = nn.ModuleList(convolutions) def forward(self, x, mask): for i in range(len(self.convolutions)-1): x = F.relu(self.convolutions[i](x, mask)) x = self.convolutions[-1](x, mask) codes = x.permute(0, 2, 1) * mask.unsqueeze(-1) return codes