Support FastSpeech2 24 kHz training; align mel settings with the GAN vocoder #61

Closed · wants to merge 2 commits
4 changes: 2 additions & 2 deletions bins/tts/preprocess.py
@@ -142,8 +142,8 @@ def preprocess(cfg, args):
dataset_types.append((cfg.preprocess.valid_file).split(".")[0])
if "test" not in dataset_types:
dataset_types.append("test")
if "eval" in dataset:
dataset_types = ["test"]
#if "eval" in dataset:
# dataset_types = ["test"]

# Dump metadata of datasets (singers, train/test durations, etc.)
cal_metadata(cfg, dataset_types)
14 changes: 7 additions & 7 deletions config/fs2.json
@@ -7,13 +7,13 @@
// acoustic features
"extract_audio": true,
"extract_mel": true,
"mel_extract_mode": "taco",
"mel_extract_mode": "raw",
"mel_min_max_norm": false,
"extract_pitch": true,
"extract_uv": false,
"pitch_extractor": "dio",
"extract_energy": true,
"energy_extract_mode": "from_tacotron_stft",
"energy_extract_mode": "from_mel",
"extract_duration": true,
"use_phone": false,
"pitch_norm": true,
@@ -22,17 +22,17 @@
"energy_remove_outlier": true,

// Default config
"n_mel": 80,
"n_mel": 100,
"win_size": 1024, // todo
"hop_size": 256,
"sample_rate": 22050,
"sample_rate": 24000,
"n_fft": 1024, // todo
"fmin": 0,
"fmax": 8000, // todo
"fmax": 12000, // todo
"raw_data": "raw_data",
"text_cleaners": ["english_cleaners"],
"f0_min": 71, // ~C2
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
"f0_min": 50, // ~C2
"f0_max": 1100, //1100, // ~C6(1100), ~G5(800)
"pitch_bin": 256,
"pitch_max": 1100.0,
"pitch_min": 50.0,
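The point of these config changes is that the acoustic model's mel features must be computed exactly the way the 24 kHz GAN vocoder expects (24000 Hz sample rate, 1024-point FFT and window, hop 256, 100 mel bins, fmax 12 kHz); any mismatch degrades synthesis. Below is a minimal sketch of mel extraction under these settings, using librosa for illustration only; Amphion's actual extraction lives in utils/mel.py (extract_mel_features_tts).

```python
# Minimal sketch of mel extraction with the new settings (24 kHz, n_fft 1024,
# hop 256, 100 mel bins, fmax 12 kHz). Illustrative only; not Amphion's API.
import librosa
import numpy as np

def mel_24k(wav: np.ndarray) -> np.ndarray:
    """Log-mel spectrogram matching the updated fs2.json / vocoder config."""
    mel = librosa.feature.melspectrogram(
        y=wav,
        sr=24000,
        n_fft=1024,
        hop_length=256,
        win_length=1024,
        n_mels=100,
        fmin=0,
        fmax=12000,
        power=1.0,  # magnitude spectrogram, as in HiFi-GAN-style extraction
    )
    # log compression with the usual 1e-5 floor
    return np.log(np.clip(mel, a_min=1e-5, a_max=None))
```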
2 changes: 1 addition & 1 deletion egs/tts/FastSpeech2/exp_config.json
@@ -13,7 +13,7 @@
"preprocess": {
// TODO: Fill in the output data path. The default value is "Amphion/data"
"processed_dir": "data",
"sample_rate": 22050,
"sample_rate": 24000,
},
"train": {
"batch_size": 16,
4 changes: 3 additions & 1 deletion egs/tts/FastSpeech2/run.sh
@@ -134,6 +134,8 @@ if [ $running_stage -eq 3 ]; then
fi


+# if you don't have a vocoder, you can download from https://huggingface.co/amphion/hifigan_speech_bigdata,
+# then link the hifigan_speech folder to pretrained/hifigan_speech
CUDA_VISIBLE_DEVICES=$gpu accelerate launch "$work_dir"/bins/tts/inference.py \
--config $exp_config \
--acoustics_dir $infer_expt_dir \
@@ -143,7 +145,7 @@
--testing_set $infer_testing_set \
--text "$infer_text" \
--log_level debug \
---vocoder_dir /mntnfs/lee_data1/chenxi/processed_data/ljspeech/model_ckpt/hifigan/checkpoints
+--vocoder_dir pretrained/hifigan_speech



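The new comment points at https://huggingface.co/amphion/hifigan_speech_bigdata. A hedged sketch of fetching that repo and creating the symlink the updated --vocoder_dir default expects; it assumes huggingface_hub is installed, and the "hifigan_speech" subfolder name is taken from the comment, not verified against the repo layout.

```python
# Hedged sketch: fetch the vocoder mentioned above and expose it at the path
# the script now passes to --vocoder_dir. Assumes huggingface_hub is installed.
import os
from huggingface_hub import snapshot_download

local_repo = snapshot_download(repo_id="amphion/hifigan_speech_bigdata")
os.makedirs("pretrained", exist_ok=True)
if not os.path.exists("pretrained/hifigan_speech"):
    os.symlink(
        os.path.join(local_repo, "hifigan_speech"),  # assumed repo layout
        "pretrained/hifigan_speech",
    )
```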
6 changes: 3 additions & 3 deletions models/tts/fastspeech2/fs2_dataset.py
@@ -161,9 +161,9 @@ def read_duration(self):

mel = np.load(self.utt2mel_path[utt]).transpose(1, 0)
duration = np.load(self.utt2duration_path[utt])
-assert mel.shape[0] == sum(
-duration
-), f"{utt}: mismatch length between mel {mel.shape[0]} and sum(duration) {sum(duration)}"
+#assert mel.shape[0] == sum(
+# duration
+#), f"{utt}: mismatch length between mel {mel.shape[0]} and sum(duration) {sum(duration)}"
utt2dur[utt] = duration
return utt2dur

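Commenting the assert out also hides genuine misalignment, not just the off-by-a-frame rounding that 24 kHz extraction can introduce. If stricter behavior is wanted later, a tolerant check along these lines (a sketch, not part of this PR) would absorb small rounding drift while still failing loudly on real errors:

```python
# Sketch of a tolerant alternative to the deleted assert (not part of this PR).
import numpy as np

def reconcile(mel: np.ndarray, duration: np.ndarray, tol: int = 2) -> np.ndarray:
    diff = mel.shape[0] - int(duration.sum())
    assert abs(diff) <= tol, f"mel/duration mismatch of {diff} frames exceeds tolerance"
    if diff > 0:
        mel = mel[: int(duration.sum())]  # mel slightly long: trim tail frames
    elif diff < 0:
        duration[-1] += diff  # mel slightly short: shrink last duration (diff < 0)
    return mel
```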
9 changes: 1 addition & 8 deletions models/tts/fastspeech2/fs2_inference.py
@@ -49,17 +49,10 @@ def _build_test_dataset(self):
@staticmethod
def _parse_vocoder(vocoder_dir):
r"""Parse vocoder config"""
-vocoder_dir = os.path.abspath(vocoder_dir)
-ckpt_list = [ckpt for ckpt in Path(vocoder_dir).glob("*.pt")]
-# last step (different from the base *int(x.stem)*)
-ckpt_list.sort(
-key=lambda x: int(x.stem.split("_")[-2].split("-")[-1]), reverse=True
-)
-ckpt_path = str(ckpt_list[0])
vocoder_cfg = load_config(
os.path.join(vocoder_dir, "args.json"), lowercase=True
)
-return vocoder_cfg, ckpt_path
+return vocoder_cfg, vocoder_dir

@torch.inference_mode()
def inference_for_batches(self):
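The old _parse_vocoder located the newest *.pt itself; the rewritten version returns the directory and leaves checkpoint selection to the vocoder loading code, so --vocoder_dir can point straight at pretrained/hifigan_speech. For reference, a sketch of the removed selection logic; the filename scheme (a step number in the second-to-last underscore-separated field) is inferred from the old sort key, not confirmed elsewhere in the repo.

```python
# Sketch of the checkpoint selection the new code no longer performs itself.
from pathlib import Path

def latest_ckpt(vocoder_dir: str) -> str:
    ckpts = list(Path(vocoder_dir).glob("*.pt"))
    ckpts.sort(
        key=lambda p: int(p.stem.split("_")[-2].split("-")[-1]), reverse=True
    )
    return str(ckpts[0])  # newest checkpoint by embedded step number
```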
2 changes: 1 addition & 1 deletion models/tts/naturalspeech2/ns2_trainer.py
@@ -13,7 +13,7 @@
from torch.utils.data import ConcatDataset, DataLoader
from models.tts.base.tts_trainer import TTSTrainer
from models.base.base_trainer import BaseTrainer
-from models.base.base_sampler import VariableSampler
+from models.tts.valle.valle_dataset import VariableSampler
from models.tts.naturalspeech2.ns2_dataset import NS2Dataset, NS2Collator, batch_by_size
from models.tts.naturalspeech2.ns2_loss import (
log_pitch_loss,
3 changes: 0 additions & 3 deletions models/vocoders/gan/gan_vocoder_trainer.py
@@ -1048,13 +1048,10 @@ def _valid_step(self, data):
valid_losses.update(discriminator_losses)
valid_losses.update(generator_losses)

for item in valid_losses:
valid_losses[item] = valid_losses[item].item()
-for item in valid_losses:
-valid_losses[item] = valid_losses[item].item()

return total_loss.item(), valid_losses
-return total_loss.item(), valid_losses

def _inference(self, eval_mel, eval_pitch=None, use_pitch=False):
"""Inference during training for test audios."""
5 changes: 3 additions & 2 deletions preprocessors/ljspeech.py
@@ -136,8 +136,9 @@ def prepare_align(dataset, dataset_path, cfg, output_path):
wav_path = os.path.join(in_dir, "wavs", "{}.wav".format(base_name))
if os.path.exists(wav_path):
os.makedirs(os.path.join(out_dir, speaker), exist_ok=True)
-wav, _ = librosa.load(wav_path, sampling_rate)
-wav = wav / max(abs(wav)) * max_wav_value
+wav, _ = librosa.load(wav_path, sr=sampling_rate)
+wav = wav / max(abs(wav)) * max_wav_value * 0.95
+# todo: let's trim silence

wavfile.write(
os.path.join(out_dir, speaker, "{}.wav".format(base_name)),
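The added 0.95 factor leaves roughly 0.4 dB of headroom so peak-normalized samples cannot overflow when cast to 16-bit PCM. A minimal sketch of the same normalization; the max_wav_value of 32768.0 is the usual value in FastSpeech2-style configs and is an assumption here.

```python
# Minimal sketch of the normalization this hunk introduces: peak-normalize,
# keep 5% headroom, write 16-bit PCM.
import numpy as np
from scipy.io import wavfile

def write_norm_wav(path: str, wav: np.ndarray, sr: int = 24000) -> None:
    max_wav_value = 32768.0  # assumed config value
    wav = wav / np.max(np.abs(wav)) * max_wav_value * 0.95  # headroom avoids int16 clipping
    wavfile.write(path, sr, wav.astype(np.int16))
```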
7 changes: 4 additions & 3 deletions processors/acoustic_extractor.py
@@ -286,10 +286,11 @@ def extract_utt_acoustic_features_tts(dataset_output, cfg, utt):
mel = extract_mel_features(
wav_torch.unsqueeze(0), cfg.preprocess, taco=True, _stft=_stft
)
-if cfg.preprocess.extract_duration:
-mel = mel[:, : sum(durations)]
else:
-mel = extract_mel_features(wav_torch.unsqueeze(0), cfg.preprocess)
+mel = extract_mel_features_tts(wav_torch.unsqueeze(0), cfg.preprocess)
+
+if cfg.preprocess.extract_duration:
+mel = mel[:, : sum(durations)]
save_feature(dataset_output, cfg.preprocess.mel_dir, uid, mel.cpu().numpy())

if cfg.preprocess.extract_energy:
20 changes: 10 additions & 10 deletions utils/audio.py
@@ -28,18 +28,18 @@ def load_audio_torch(wave_file, fs):
assert len(audio) > 2

# Check the audio type (for soundfile loading backbone) - float, 8bit or 16bit
-if np.issubdtype(audio.dtype, np.integer):
-max_mag = -np.iinfo(audio.dtype).min
-else:
-max_mag = max(np.amax(audio), -np.amin(audio))
-max_mag = (
-(2**31) + 1
-if max_mag > (2**15)
-else ((2**15) + 1 if max_mag > 1.01 else 1.0)
-)
+# if np.issubdtype(audio.dtype, np.integer):
+# max_mag = -np.iinfo(audio.dtype).min
+# else:
+# max_mag = max(np.amax(audio), -np.amin(audio))
+# max_mag = (
+# (2**31) + 1
+# if max_mag > (2**15)
+# else ((2**15) + 1 if max_mag > 1.01 else 1.0)
+# )

# Normalize the audio
-audio = torch.FloatTensor(audio.astype(np.float32)) / max_mag
+audio = torch.FloatTensor(audio.astype(np.float32))

if (torch.isnan(audio) | torch.isinf(audio)).any():
return [], sample_rate or fs or 48000
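The rationale appears to be that the soundfile loading backend already returns float samples in [-1, 1], which makes the dtype-based rescaling unnecessary. A quick check of that premise, with an illustrative file path:

```python
# Quick check of the premise behind dropping max_mag: soundfile already yields
# normalized float audio. "example.wav" is an illustrative path.
import numpy as np
import soundfile as sf

audio, sr = sf.read("example.wav", dtype="float32")
assert audio.dtype == np.float32 and np.abs(audio).max() <= 1.0
```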
2 changes: 0 additions & 2 deletions utils/mel.py
@@ -232,8 +232,6 @@ def extract_mel_features_tts(
spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
spec = spectral_normalize_torch(spec)
spec = spec.squeeze(0)
-spec = torch.matmul(mel_basis[str(cfg.fmax) + "_" + str(y.device)], spec)
-spec = spectral_normalize_torch(spec)
else:
audio = torch.clip(y, -1, 1)
audio = torch.autograd.Variable(audio, requires_grad=False)
4 changes: 2 additions & 2 deletions utils/stft.py
@@ -66,7 +66,7 @@ def window_sumsquare(
# Compute the squared window at the desired length
win_sq = get_window(window, win_length, fftbins=True)
win_sq = librosa_util.normalize(win_sq, norm=norm) ** 2
-win_sq = librosa_util.pad_center(win_sq, n_fft)
+win_sq = librosa_util.pad_center(win_sq, size=n_fft)

# Fill the envelope
for i in range(n_frames):
@@ -243,7 +243,7 @@ def __init__(
self.sampling_rate = sampling_rate
self.stft_fn = STFT(filter_length, hop_length, win_length)
mel_basis = librosa_mel_fn(
-sampling_rate, filter_length, n_mel_channels, mel_fmin, mel_fmax
+sr=sampling_rate, n_fft=filter_length, n_mels=n_mel_channels, fmin=mel_fmin, fmax=mel_fmax
)
mel_basis = torch.from_numpy(mel_basis).float()
self.register_buffer("mel_basis", mel_basis)
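Both hunks track the keyword-only API introduced in librosa 0.10: librosa.util.pad_center takes size as a keyword argument, and librosa.filters.mel takes all of its parameters as keywords, so the old positional calls raise a TypeError on current librosa. A sketch of the updated calls, with values mirroring the new 24 kHz config:

```python
# Both hunks track the keyword-only API of librosa >= 0.10, where positional
# calls like librosa_mel_fn(sr, n_fft, n_mels, fmin, fmax) raise a TypeError.
import librosa
import numpy as np

win_sq = librosa.util.pad_center(np.hanning(1024), size=1024)  # keyword-only size
mel_basis = librosa.filters.mel(
    sr=24000, n_fft=1024, n_mels=100, fmin=0, fmax=12000  # keyword-only params
)
```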