Librilight Preprocess Scripts Revision #125

Closed · wants to merge 11 commits
8 changes: 5 additions & 3 deletions bins/svc/train.py
@@ -87,9 +87,11 @@ def main():
for dataset in cfg.preprocess.data_augment:
new_datasets = [
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None,
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
]
4 changes: 4 additions & 0 deletions bins/tts/preprocess.py
@@ -100,6 +100,7 @@ def preprocess(cfg, args):
"""
# Specify the output root path to save the processed data
output_path = cfg.preprocess.processed_dir
print("Output path: {}".format(output_path))
os.makedirs(output_path, exist_ok=True)

# Split train and test sets
@@ -137,6 +138,9 @@ def preprocess(cfg, args):
except:
print("No Data Augmentation.")

if "librilight" in cfg.dataset:
return
Comment on lines +141 to +142 (Collaborator):
These two lines imply that the LibriLight dataset does not undergo subsequent feature extraction; does it use online feature extraction? Can we add a condition here to determine whether the feature extraction method is online or pre-extraction? That way the system would support both types of feature extraction.
P.S. We still need to integrate the online feature extraction process later.
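
A minimal sketch of the suggested branch, assuming a hypothetical `use_online_feature_extraction` flag (illustrative, not code from this PR):

```python
# Hypothetical sketch of the reviewer's suggestion; the flag name is
# illustrative and does not exist in this PR.
if "librilight" in cfg.dataset:
    if getattr(cfg.preprocess, "use_online_feature_extraction", False):
        # Online mode: features are computed on the fly during training,
        # so skip the offline extraction steps below.
        return
# Otherwise fall through to the existing offline (pre-extraction) pipeline.
```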


# json files
dataset_types = list()
dataset_types.append((cfg.preprocess.train_file).split(".")[0])
8 changes: 5 additions & 3 deletions bins/tts/train.py
@@ -86,9 +86,11 @@ def main():
for dataset in cfg.preprocess.data_augment:
new_datasets = [
f"{dataset}_pitch_shift" if cfg.preprocess.use_pitch_shift else None,
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None,
(
f"{dataset}_formant_shift"
if cfg.preprocess.use_formant_shift
else None
),
f"{dataset}_equalizer" if cfg.preprocess.use_equalizer else None,
f"{dataset}_time_stretch" if cfg.preprocess.use_time_stretch else None,
]
2 changes: 1 addition & 1 deletion config/ns2.json
@@ -1,5 +1,5 @@
{
"base_config": "config/base.json",
"base_config": "config/tts.json",
"model_type": "NaturalSpeech2",
"dataset": ["libritts"],
"preprocess": {
10 changes: 10 additions & 0 deletions config/tts.json
@@ -16,6 +16,16 @@
// Directory names of processed data or extracted features
"phone_dir": "phones",
"use_phone": true,
"n_cpus": 16,
"n_gpus": 8,
"cut_length": 10,
"max_length": 20,
"whisper_model_id":"distil-whisper/distil-medium.en",
"used_subsets": ["tiny"],
// MFA files
"mfa_dict_path": "./mfa/english_mfa/mfa_dict.dict",
"mfa_model_path": "./mfa/english_mfa/model",
"mfa_config_path": "./mfa/english_mfa/config.yaml",
HarryHe11 marked this conversation as resolved.
},
"model": {
"text_token_num": 512,
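As a side note, a minimal sketch of how the MFA keys added above might be consumed downstream; `run_mfa_align` is a hypothetical helper, and the exact `mfa align` CLI arguments should be verified against the installed Montreal Forced Aligner version:

```python
# Illustrative only; not code from this PR. Assumes cfg.preprocess carries
# the keys added above and that MFA is installed ("mfa" on PATH).
import subprocess

def run_mfa_align(cfg, corpus_dir, textgrid_dir):
    """Run forced alignment using the dictionary/model/config paths from cfg."""
    cmd = [
        "mfa", "align",
        corpus_dir,                        # wavs + transcripts to align
        cfg.preprocess.mfa_dict_path,      # pronunciation dictionary
        cfg.preprocess.mfa_model_path,     # acoustic model
        textgrid_dir,                      # where TextGrids are written
        "--config_path", cfg.preprocess.mfa_config_path,
        "--num_jobs", str(cfg.preprocess.n_cpus),
    ]
    subprocess.run(cmd, check=True)
```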
4 changes: 4 additions & 0 deletions env.sh
@@ -21,6 +21,10 @@ pip install git+https://github.com/lhotse-speech/lhotse

pip install -U encodec

pip install -U textgrid

pip install black

pip install phonemizer==3.2.1 pypinyin==0.48.0

# Uninstall nvidia-cublas-cu11 if there exist some bugs about CUDA version
4 changes: 1 addition & 3 deletions evaluation/metrics/similarity/models/RawNetModel.py
@@ -121,9 +121,7 @@ def forward(self, x):
w = self.attention(global_x)

mu = torch.sum(x * w, dim=2)
sg = torch.sqrt(
(torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4)
)
sg = torch.sqrt((torch.sum((x**2) * w, dim=2) - mu**2).clamp(min=1e-4, max=1e4))

x = torch.cat((mu, sg), 1)

6 changes: 2 additions & 4 deletions models/tta/autoencoder/autoencoder.py
@@ -250,8 +250,7 @@ def forward(self, x):


# TODO: Encoder1d
class Encoder1d(Encoder2d):
...
class Encoder1d(Encoder2d): ...


class Decoder2d(nn.Module):
@@ -351,8 +350,7 @@ def forward(self, z):


# TODO: decoder1d
class Decoder1d(Decoder2d):
...
class Decoder1d(Decoder2d): ...


class AutoencoderKL(nn.Module):
6 changes: 2 additions & 4 deletions models/tta/autoencoder/autoencoder_dataset.py
@@ -106,9 +106,7 @@ def __call__(self, batch):
return packed_batch_features


class AutoencoderKLTestDataset(BaseTestDataset):
...
class AutoencoderKLTestDataset(BaseTestDataset): ...


class AutoencoderKLTestCollator(BaseTestCollator):
...
class AutoencoderKLTestCollator(BaseTestCollator): ...
30 changes: 16 additions & 14 deletions models/tta/ldm/audioldm.py
@@ -755,20 +755,22 @@ def __init__(
use_checkpoint=use_checkpoint,
use_scale_shift_norm=use_scale_shift_norm,
),
AttentionBlock(
ch,
use_checkpoint=use_checkpoint,
num_heads=num_heads,
num_head_channels=dim_head,
use_new_attention_order=use_new_attention_order,
)
if not use_spatial_transformer
else SpatialTransformer(
ch,
num_heads,
dim_head,
depth=transformer_depth,
context_dim=context_dim,
(
AttentionBlock(
ch,
use_checkpoint=use_checkpoint,
num_heads=num_heads,
num_head_channels=dim_head,
use_new_attention_order=use_new_attention_order,
)
if not use_spatial_transformer
else SpatialTransformer(
ch,
num_heads,
dim_head,
depth=transformer_depth,
context_dim=context_dim,
)
),
ResBlock(
ch,
6 changes: 2 additions & 4 deletions models/tta/ldm/audioldm_dataset.py
@@ -145,9 +145,7 @@ def __call__(self, batch):
return packed_batch_features


class AudioLDMTestDataset(BaseTestDataset):
...
class AudioLDMTestDataset(BaseTestDataset): ...


class AudioLDMTestCollator(BaseTestCollator):
...
class AudioLDMTestCollator(BaseTestCollator): ...
12 changes: 7 additions & 5 deletions models/tts/valle/valle.py
@@ -194,11 +194,13 @@ def __init__(
adaptive_layer_norm=True,
),
num_layers=int(num_decoder_layers * nar_scale_factor),
norm=AdaptiveLayerNorm(
nar_decoder_dim, norm=nn.LayerNorm(nar_decoder_dim)
)
if cfg.norm_first
else None,
norm=(
AdaptiveLayerNorm(
nar_decoder_dim, norm=nn.LayerNorm(nar_decoder_dim)
)
if cfg.norm_first
else None
),
)
self.nar_predict_layers = nn.ModuleList(
[
16 changes: 10 additions & 6 deletions models/vocoders/vocoder_inference.py
@@ -303,9 +303,11 @@ def _load_model(self, checkpoint_dir, from_multi_gpu=False):
]:
ckpt = torch.load(
checkpoint_dir,
map_location=torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu"),
map_location=(
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
),
)
if from_multi_gpu:
pretrained_generator_dict = ckpt["generator_state_dict"]
@@ -412,9 +414,11 @@ def load_nnvocoder(
if vocoder_name in ["bigvgan", "hifigan", "melgan", "nsfhifigan"]:
ckpt = torch.load(
weights_file,
map_location=torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu"),
map_location=(
torch.device("cuda")
if torch.cuda.is_available()
else torch.device("cpu")
),
)
if from_multi_gpu:
pretrained_generator_dict = ckpt["generator_state_dict"]
4 changes: 1 addition & 3 deletions modules/diffusion/karras/karras_diffusion.py
@@ -465,9 +465,7 @@ def to_d(x, sigma, denoised):
def get_ancestral_step(sigma_from, sigma_to):
"""Calculates the noise level (sigma_down) to step down to and the amount
of noise to add (sigma_up) when doing an ancestral sampling step."""
sigma_up = (
sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2
) ** 0.5
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
return sigma_down, sigma_up
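
For reference, `get_ancestral_step` splits the noise so that `sigma_down**2 + sigma_up**2 == sigma_to**2` by construction; a quick standalone check with illustrative values (not part of the PR):

```python
# Sanity check of the ancestral-step math above (illustrative values).
sigma_from, sigma_to = 2.0, 1.0
sigma_up = (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5
sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
assert abs(sigma_up**2 + sigma_down**2 - sigma_to**2) < 1e-12
```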

11 changes: 6 additions & 5 deletions modules/transformer/mh_attention.py
@@ -65,6 +65,7 @@ class MultiheadAttention(Module):
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)

"""

__constants__ = ["batch_first"]
bias_k: Optional[torch.Tensor]
bias_v: Optional[torch.Tensor]
@@ -340,11 +341,11 @@ def forward(
key_padding_mask if key_padding_mask is not None else attn_mask,
need_weights,
average_attn_weights,
1
if key_padding_mask is not None
else 0
if attn_mask is not None
else None,
(
1
if key_padding_mask is not None
else 0 if attn_mask is not None else None
),
)

any_nested = query.is_nested or key.is_nested or value.is_nested
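
The chained conditional above is easy to misread; a standalone restatement (illustrative, assuming the convention that 1 selects the key-padding mask, 0 the attention mask, and None means no mask):

```python
# Restatement of `1 if key_padding_mask is not None else 0 if attn_mask
# is not None else None` as an explicit function (illustrative only).
def mask_type(key_padding_mask, attn_mask):
    if key_padding_mask is not None:
        return 1      # a key-padding mask is supplied
    if attn_mask is not None:
        return 0      # only an attention mask is supplied
    return None       # no mask at all
```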
10 changes: 7 additions & 3 deletions modules/transformer/transformer.py
@@ -385,9 +385,13 @@ def _init_norm_layers(
return (
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs),
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs),
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
if layer_norm_cls != IdentityNorm
else BalancedBasicNorm(d_model, eps=layer_norm_eps, **factory_kwargs),
(
layer_norm_cls(d_model, eps=layer_norm_eps, **factory_kwargs)
if layer_norm_cls != IdentityNorm
else BalancedBasicNorm(
d_model, eps=layer_norm_eps, **factory_kwargs
)
),
)


32 changes: 20 additions & 12 deletions modules/wenet_extractor/efficient_conformer/encoder.py
@@ -234,12 +234,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args_stride)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args_stride)
if use_cnn_module
else None
),
torch.nn.AvgPool1d(
kernel_size=self.stride[index],
stride=self.stride[index],
@@ -266,12 +270,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args_normal)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args_normal)
if use_cnn_module
else None
),
dropout_rate,
normalize_before,
)
12 changes: 7 additions & 5 deletions modules/wenet_extractor/squeezeformer/encoder.py
@@ -424,11 +424,13 @@ def forward_chunk(
xs,
att_mask,
pos_emb,
att_cache=att_cache[i : i + 1][:, :, ::factor, :][
:, :, : pos_emb.size(1) - xs.size(1), :
]
if elayers > 0
else att_cache[:, :, ::factor, :],
att_cache=(
att_cache[i : i + 1][:, :, ::factor, :][
:, :, : pos_emb.size(1) - xs.size(1), :
]
if elayers > 0
else att_cache[:, :, ::factor, :]
),
cnn_cache=cnn_cache[i] if cnn_cache.size(0) > 0 else cnn_cache,
)
# NOTE(xcsong): After layer.forward
12 changes: 7 additions & 5 deletions modules/wenet_extractor/transformer/decoder.py
@@ -95,11 +95,13 @@ def __init__(
MultiHeadedAttention(
attention_heads, attention_dim, self_attention_dropout_rate
),
MultiHeadedAttention(
attention_heads, attention_dim, src_attention_dropout_rate
)
if src_attention
else None,
(
MultiHeadedAttention(
attention_heads, attention_dim, src_attention_dropout_rate
)
if src_attention
else None
),
PositionwiseFeedForward(attention_dim, linear_units, dropout_rate),
dropout_rate,
normalize_before,
16 changes: 10 additions & 6 deletions modules/wenet_extractor/transformer/encoder.py
@@ -489,12 +489,16 @@ def __init__(
output_size,
encoder_selfattn_layer(*encoder_selfattn_layer_args),
positionwise_layer(*positionwise_layer_args),
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None,
convolution_layer(*convolution_layer_args)
if use_cnn_module
else None,
(
positionwise_layer(*positionwise_layer_args)
if macaron_style
else None
),
(
convolution_layer(*convolution_layer_args)
if use_cnn_module
else None
),
dropout_rate,
normalize_before,
)
8 changes: 5 additions & 3 deletions modules/wenet_extractor/utils/init_model.py
@@ -76,9 +76,11 @@ def init_model(configs):
input_dim,
global_cmvn=global_cmvn,
**configs["encoder_conf"],
**configs["encoder_conf"]["efficient_conf"]
if "efficient_conf" in configs["encoder_conf"]
else {},
**(
configs["encoder_conf"]["efficient_conf"]
if "efficient_conf" in configs["encoder_conf"]
else {}
),
)
else:
encoder = TransformerEncoder(