Librilight Preprocess Scripts Revision #125

Closed · wants to merge 11 commits
8 changes: 5 additions & 3 deletions bins/svc/train.py
@@ -87,9 +87,11 @@ def main():
for dataset in cfg.preprocess.data_augment:
new_datasets = [
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None,
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
]
4 changes: 4 additions & 0 deletions bins/tts/preprocess.py
@@ -100,6 +100,7 @@ def preprocess(cfg, args):
"""
# Specify the output root path to save the processed data
output_path = cfg.preprocess.processed_dir
print("Output path: {}".format(output_path))
os.makedirs(output_path, exist_ok=True)

# Split train and test sets
@@ -137,6 +138,9 @@ def preprocess(cfg, args):
except:
print("No Data Augmentation.")

if "librilight" in cfg.dataset:
return
Comment on lines +141 to +142 (Collaborator):
These two lines imply that the LibriLight dataset does not undergo subsequent feature extraction; does it use online feature extraction? Can we add a condition here to determine whether the feature extraction method is online or pre-extraction? That way the system would support both types of feature extraction.
P.S. We still need to integrate the online feature extraction process later.
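
A minimal sketch of the suggested branch, assuming a hypothetical `use_online_feature_extraction` flag (illustrative, not code from this PR):

```python
# Hypothetical sketch of the reviewer's suggestion; the flag name is
# illustrative and does not exist in this PR.
if "librilight" in cfg.dataset:
    if getattr(cfg.preprocess, "use_online_feature_extraction", False):
        # Online mode: features are computed on the fly during training,
        # so skip the offline extraction steps below.
        return
# Otherwise fall through to the existing offline (pre-extraction) pipeline.
```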


# json files
dataset_types = list()
dataset_types.append((cfg.preprocess.train_file).split(".")[0])
8 changes: 5 additions & 3 deletions bins/tts/train.py
@@ -86,9 +86,11 @@ def main():
for dataset in cfg.preprocess.data_augment:
new_datasets = [
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None,
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
]
2 changes: 1 addition & 1 deletion config/ns2.json
@@ -1,5 +1,5 @@
{
"base_config": "config/base.json",
"base_config": "config/tts.json",
"model_type": "NaturalSpeech2",
"dataset": ["libritts"],
"preprocess": {
10 changes: 10 additions & 0 deletions config/tts.json
@@ -16,6 +16,16 @@
// Directory names of processed data or extracted features
"phone_dir": "phones",
"use_phone": true,
"n_cpus": 16,
"n_gpus": 8,
"cut_length": 10,
"max_length": 20,
"whisper_model_id":"distil-whisper/distil-medium.en",
"used_subsets": ["tiny"],
// MFA files
"mfa_dict_path": "./mfa/english_mfa/mfa_dict.dict",
"mfa_model_path": "./mfa/english_mfa/model",
"mfa_config_path": "./mfa/english_mfa/config.yaml",
HarryHe11 marked this conversation as resolved.
},
"model": {
"text_token_num": 512,
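As a side note, a minimal sketch of how the MFA keys added above might be consumed downstream; `run_mfa_align` is a hypothetical helper, and the exact `mfa align` CLI arguments should be verified against the installed Montreal Forced Aligner version:

```python
# Illustrative only; not code from this PR. Assumes cfg.preprocess carries
# the keys added above and that MFA is installed ("mfa" on PATH).
import subprocess

def run_mfa_align(cfg, corpus_dir, textgrid_dir):
    """Run forced alignment using the dictionary/model/config paths from cfg."""
    cmd = [
        "mfa", "align",
        corpus_dir,                        # wavs + transcripts to align
        cfg.preprocess.mfa_dict_path,      # pronunciation dictionary
        cfg.preprocess.mfa_model_path,     # acoustic model
        textgrid_dir,                      # where TextGrids are written
        "--config_path", cfg.preprocess.mfa_config_path,
        "--num_jobs", str(cfg.preprocess.n_cpus),
    ]
    subprocess.run(cmd, check=True)
```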
4 changes: 4 additions & 0 deletions env.sh
@@ -21,6 +21,10 @@ pip install git+https://github.com/lhotse-speech/lhotse

pip install -U encodec

pip install -U textgrid

pip install black

pip install phonemizer==3.2.1 pypinyin==0.48.0

# Uninstall nvidia-cublas-cu11 if there exist some bugs about CUDA version
4 changes: 1 addition & 3 deletions evaluation/metrics/similarity/models/RawNetModel.py
@@ -121,9 +121,7 @@ def forward(self, x):
w = self.attention(global_x)

mu = torch.sum(x * w, dim=2)
sg = torch.sqrt(
(torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
)
sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4))

x = torch.cat((mu, sg), 1)

6 changes: 2 additions & 4 deletions models/tta/autoencoder/autoencoder.py
@@ -250,8 +250,7 @@ def forward(self, x):


# TODO: Encoder1d
class Encoder1d(Encoder2d):
...
class Encoder1d(Encoder2d): ...


class Decoder2d(nn.Module):
@@ -351,8 +350,7 @@ def forward(self, z):


# TODO: decoder1d
class Decoder1d(Decoder2d):
...
class Decoder1d(Decoder2d): ...


class AutoencoderKL(nn.Module):
6 changes: 2 additions & 4 deletions models/tta/autoencoder/autoencoder_dataset.py
@@ -106,9 +106,7 @@ def __call__(self, batch):
return packed_batch_features


class AutoencoderKLTestDataset(BaseTestDataset):
...
class AutoencoderKLTestDataset(BaseTestDataset): ...


class AutoencoderKLTestCollator(BaseTestCollator):
...
class AutoencoderKLTestCollator(BaseTestCollator): ...
30 changes: 16 additions & 14 deletions models/tta/ldm/audioldm.py
@@ -755,20 +755,22 @@ def __init__(
use_checkpoint=use_checkpoint,
use_scale_shift_norm=use_scale_shift_norm,
),
AttentionBlock(
ch,
use_checkpoint=use_checkpoint,
num_heads=num_heads,
num_head_channels=dim_head,
use_new_attention_order=use_new_attention_order,
)
if not use_spatial_transformer
else SpatialTransformer(
ch,
num_heads,
dim_head,
depth=transformer_depth,
context_dim=context_dim,
(
AttentionBlock(
ch,
use_checkpoint=use_checkpoint,
num_heads=num_heads,
num_head_channels=dim_head,
use_new_attention_order=use_new_attention_order,
)
if not use_spatial_transformer
else SpatialTransformer(
ch,
num_heads,
dim_head,
depth=transformer_depth,
context_dim=context_dim,
)
),
ResBlock(
ch,
6 changes: 2 additions & 4 deletions models/tta/ldm/audioldm_dataset.py
@@ -145,9 +145,7 @@ def __call__(self, batch):
return packed_batch_features


class AudioLDMTestDataset(BaseTestDataset):
...
class AudioLDMTestDataset(BaseTestDataset): ...


class AudioLDMTestCollator(BaseTestCollator):
...
class AudioLDMTestCollator(BaseTestCollator): ...
12 changes: 7 additions & 5 deletions models/tts/valle/valle.py
@@ -194,11 +194,13 @@ def __init__(
adaptive_layer_norm=True,
),
num_layers=int(num_decoder_layers * nar_scale_factor),
norm=AdaptiveLayerNorm(
nar_decoder_dim, norm=nn.LayerNorm(nar_decoder_dim)
)
if cfg.norm_first
else None,
norm=(
AdaptiveLayerNorm(
nar_decoder_dim, norm=nn.LayerNorm(nar_decoder_dim)
)
if cfg.norm_first
else None
),
)
self.nar_predict_layers = nn.ModuleList(
[
16 changes: 10 additions & 6 deletions models/vocoders/vocoder_inference.py
@@ -303,9 +303,11 @@ def _load_model(self, checkpoint_dir, from_multi_gpu=False):
]:
ckpt = torch.load(
checkpoint_dir,
map_location=torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu"),
map_location=(
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
),
)
if from_multi_gpu:
pretrained_generator_dict = ckpt["generator_state_dict"]
@@ -412,9 +414,11 @@ def load_nnvocoder(
if vocoder_name in ["bigvgan", "hifigan", "melgan", "nsfhifigan"]:
ckpt = torch.load(
weights_file,
map_location=torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu"),
map_location=(
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
),
)
if from_multi_gpu:
pretrained_generator_dict = ckpt["generator_state_dict"]
4 changes: 1 addition & 3 deletions modules/diffusion/karras/karras_diffusion.py
@@ -465,9 +465,7 @@ def to_d(x, sigma, denoised):
def get_ancestral_step(sigma_from, sigma_to):
"""Calculates the noise level (sigma_down) to step down to and the amount
of noise to add (sigma_up) when doing an ancestral sampling step."""
sigma_up = (
sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2
) ** 0.5
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
return sigma_down, sigma_up
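
For reference, `get_ancestral_step` splits the noise so that `sigma_down**2 + sigma_up**2 == sigma_to**2` by construction; a quick standalone check with illustrative values (not part of the PR):

```python
# Sanity check of the ancestral-step math above (illustrative values).
sigma_from, sigma_to = 2.0, 1.0
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
assert abs(sigma_up**2 + sigma_down**2 - sigma_to**2) < 1e-12
```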

11 changes: 6 additions & 5 deletions modules/transformer/mh_attention.py
@@ -65,6 +65,7 @@ class MultiheadAttention(Module):
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)

"""

__constants__ = ["batch_first"]
bias_k: Optional[torch.Tensor]
bias_v: Optional[torch.Tensor]
@@ -340,11 +341,11 @@ def forward(
key_padding_mask if key_padding_mask is not None else attn_mask,
need_weights,
average_attn_weights,
1
if key_padding_mask is not None
else 0
if attn_mask is not None
else None,
(
1
if key_padding_mask is not None
else 0 if attn_mask is not None else None
),
)

any_nested = query.is_nested or key.is_nested or value.is_nested
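
The chained conditional above is easy to misread; a standalone restatement (illustrative, assuming the convention that 1 selects the key-padding mask, 0 the attention mask, and None means no mask):

```python
# Restatement of `1 if key_padding_mask is not None else 0 if attn_mask
# is not None else None` as an explicit function (illustrative only).
def mask_type(key_padding_mask, attn_mask):
    if key_padding_mask is not None:
        return 1      # a key-padding mask is supplied
    if attn_mask is not None:
        return 0      # only an attention mask is supplied
    return None       # no mask at all
```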
10 changes: 7 additions & 3 deletions modules/transformer/transformer.py
@@ -385,9 +385,13 @@ def _init_norm_layers(
return (
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs),
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs),
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
if layer_norm_cls != IdentityNorm
else BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs),
(
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
if layer_norm_cls != IdentityNorm
else BalancedBasicNorm(
d_model, eps=layer_norm_eps, **factory_kwargs
)
),
)


32 changes: 20 additions & 12 deletions modules/wenet_extractor/efficient_conformer/encoder.py
@@ -234,12 +234,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args_stride)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args_stride)
if use_cnn_module
else None
),
torch.nn.AvgPool1d(
kernel_size=self.stride[index],
stride=self.stride[index],
@@ -266,12 +270,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args_normal)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args_normal)
if use_cnn_module
else None
),
dropout_rate,
normalize_before,
)
12 changes: 7 additions & 5 deletions modules/wenet_extractor/squeezeformer/encoder.py
@@ -424,11 +424,13 @@ def forward_chunk(
xs,
att_mask,
pos_emb,
att_cache=att_cache[i : i + 1][:, :, ::factor, :][
:, :, : pos_emb.size(1) - xs.size(1), :
]
if elayers > 0
else att_cache[:, :, ::factor, :],
att_cache=(
att_cache[i : i + 1][:, :, ::factor, :][
:, :, : pos_emb.size(1) - xs.size(1), :
]
if elayers > 0
else att_cache[:, :, ::factor, :]
),
cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache,
)
# NOTE(xcsong): After layer.forward
12 changes: 7 additions & 5 deletions modules/wenet_extractor/transformer/decoder.py
@@ -95,11 +95,13 @@ def __init__(
MultiHeadedAttention(
attention_heads, attention_dim, self_attention_dropout_rate
),
MultiHeadedAttention(
attention_heads, attention_dim, src_attention_dropout_rate
)
if src_attention
else None,
(
MultiHeadedAttention(
attention_heads, attention_dim, src_attention_dropout_rate
)
if src_attention
else None
),
PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
dropout_rate,
normalize_before,
16 changes: 10 additions & 6 deletions modules/wenet_extractor/transformer/encoder.py
@@ -489,12 +489,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args)
if use_cnn_module
else None
),
dropout_rate,
normalize_before,
)
8 changes: 5 additions & 3 deletions modules/wenet_extractor/utils/init_model.py
@@ -76,9 +76,11 @@ def init_model(configs):
input_dim,
global_cmvn=global_cmvn,
**configs["encoder_conf"],
**configs["encoder_conf"]["efficient_conf"]
if "efficient_conf" in configs["encoder_conf"]
else {},
**(
configs["encoder_conf"]["efficient_conf"]
if "efficient_conf" in configs["encoder_conf"]
else {}
),
)
else:
encoder = TransformerEncoder(