From ce511b8417138ca7068c3b16582ce4cd7dc1415e Mon Sep 17 00:00:00 2001
From: Adorable-Qin
Date: Thu, 11 Apr 2024 20:37:40 +0800
Subject: [PATCH] Remove useless file

---
 modules/encoder/conv_encoder.py | 103 --------------------------------
 1 file changed, 103 deletions(-)
 delete mode 100644 modules/encoder/conv_encoder.py

diff --git a/modules/encoder/conv_encoder.py b/modules/encoder/conv_encoder.py
deleted file mode 100644
index cfbd2e29..00000000
--- a/modules/encoder/conv_encoder.py
+++ /dev/null
@@ -1,103 +0,0 @@
-# Copyright (c) 2023 Amphion.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from torch.nn.utils import spectral_norm
-from modules.generic.conv import Conv1d
-
-
-class ConvEncoder(nn.Module):
-    def __init__(self, in_channels, z_channels, spk_channels, num_dilation_layer=10):
-        super(ConvEncoder, self).__init__()
-
-        self.in_channels = in_channels
-        self.z_channels = z_channels
-        self.spk_channels = spk_channels
-
-        self.pre_process = Conv1d(in_channels, 512, kernel_size=3)
-
-        self.dilated_conv_layers = nn.ModuleList()
-        for i in range(num_dilation_layer):
-            dilation = 2**i
-            self.dilated_conv_layers.append(
-                DilatedConvBlock(512, 512, z_channels, spk_channels, dilation)
-            )
-
-    def forward(self, inputs, z, s):
-        inputs = inputs.transpose(1, 2)
-        outputs = self.pre_process(inputs)
-        print(inputs.shape)
-        for layer in self.dilated_conv_layers:
-            outputs = layer(outputs, z, s)
-
-        encoder_outputs = outputs.transpose(1, 2)
-        return encoder_outputs
-
-
-class DilatedConvBlock(nn.Module):
-    """A stack of dilated convolutions interspersed
-    with batch normalisation and ReLU activations"""
-
-    def __init__(self, in_channels, out_channels, z_channels, s_channels, dilation):
-        super(DilatedConvBlock, self).__init__()
-
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.z_channels = z_channels
-        self.s_channels = s_channels
-
-        self.conv1d = Conv1d(
-            in_channels, out_channels, kernel_size=3, dilation=dilation
-        )
-        self.batch_layer = BatchNorm1dLayer(out_channels, s_channels, z_channels)
-
-    def forward(self, inputs, z, s):
-        outputs = self.conv1d(inputs)
-        outputs = self.batch_layer(outputs, z, s)
-        return F.relu(outputs)
-
-
-class BatchNorm1dLayer(nn.Module):
-    """The latents z and speaker embedding s modulate the scale and
-    shift parameters of the batch normalisation layers"""
-
-    def __init__(self, num_features, s_channels=128, z_channels=128):
-        super().__init__()
-
-        self.num_features = num_features
-        self.s_channels = s_channels
-        self.z_channels = z_channels
-        self.batch_nrom = nn.BatchNorm1d(num_features, affine=False)
-
-        self.scale_layer = spectral_norm(nn.Linear(z_channels, num_features))
-        self.scale_layer.weight.data.normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
-        self.scale_layer.bias.data.zero_()  # Initialise bias at 0
-
-        self.shift_layer = spectral_norm(nn.Linear(s_channels, num_features))
-        self.shift_layer.weight.data.normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
-        self.shift_layer.bias.data.zero_()  # Initialise bias at 0
-
-    def forward(self, inputs, z, s):
-        outputs = self.batch_nrom(inputs)
-        scale = self.scale_layer(z)
-        scale = scale.view(-1, self.num_features, 1)
-
-        shift = self.shift_layer(s)
-        shift = shift.view(-1, self.num_features, 1)
-
-        outputs = scale * outputs + shift
-
-        return outputs
-
-
-if __name__ == "__main__":
-    model = ConvEncoder(256, 64, 64)
-    encoder_inputs = torch.randn(2, 256, 10)
-    z = torch.randn(2, 64)
-    speaker = torch.randn(1, 64)
-    outputs, duration = model(encoder_inputs, z, speaker)
-    print(outputs.shape, duration.shape)