finish data setup
cantabile-kwok committed Oct 8, 2023
1 parent 93c56c5 commit ffe04e2
Showing 86 changed files with 319 additions and 34 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -150,3 +150,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
synthetic_wav/
exp/
**/*.wav
49 changes: 49 additions & 0 deletions README.md
@@ -21,11 +21,60 @@ python setup.py build_ext --inplace
Note that to avoid the trouble of installing [torchdyn](https://github.com/DiffEqML/torchdyn), we directly include a local copy of torchdyn 1.0.6 at `torchdyn/`.

The following steps also require the `bash` and `perl` commands to be available in your environment.

## Data Preparation
This repo relies on Kaldi-style data organization.
All data description files should be put in subdirectories under `data/`.
See `data/ljspeech/example` for a basic example.
In this example, the following plain-text files are necessary:
1. `wav.scp`: one `utt /path/to/wav` pair per line.
2. `utts.list`: one utterance ID per line. It can be obtained with `cut -d ' ' -f 1 wav.scp > utts.list`.
3. `utt2spk`: one `utt spk_name` pair per line.
4. `text` and `phn_duration`: the phoneme sequence of each utterance and the corresponding integer durations (in frames).
There is also a `data/ljspeech/phones.txt` file that lists every phone together with its index in the dictionary.
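
As a quick sanity check (a generic sketch, not a script shipped with this repo), all of these manifests should cover exactly the same set of utterance IDs:
```shell
# Generic consistency check over the Kaldi-style manifests.
dir=data/ljspeech/example
for f in wav.scp utt2spk text phn_duration; do
  cut -d ' ' -f 1 ${dir}/${f} | sort > /tmp/ids.${f}
done
diff /tmp/ids.wav.scp /tmp/ids.utt2spk \
  && diff /tmp/ids.text /tmp/ids.phn_duration \
  && diff <(sort ${dir}/utts.list) /tmp/ids.wav.scp \
  && echo "manifests are consistent"
```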

For LJSpeech, we provide the processed files [online](https://huggingface.co/datasets/cantabile-kwok/ljspeech-1024-256-dur/resolve/main/ljspeech-1024-256.zip).
You can download and unzip them to `data/ljspeech`.
If you want to train on your own dataset, you will have to create these files yourself (or change the data-loading strategy).
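
For example (the URL is the one above; depending on the archive layout you may need to move the extracted folders so the manifests end up directly under `data/ljspeech`):
```shell
wget https://huggingface.co/datasets/cantabile-kwok/ljspeech-1024-256-dur/resolve/main/ljspeech-1024-256.zip
unzip ljspeech-1024-256.zip -d data/ljspeech
```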

After preparing these manifest files, run the following to extract mel-spectrograms for training:
```shell
bash extract_fbank.sh --stage 0 --stop_stage 2 --nj 16
# nj: number of parallel jobs.
# Have a look at the script if you need to change anything.
# Bash variables defined before "parse_options.sh" can be set from the CLI, e.g. "--key value".
```
Note that we use **16 kHz** data by default here.
This will create `feats/fbank` and `feats/normed_fbank`, where Kaldi-style scp and ark files store the mel-spectrogram data.
The normalized features are the ones used for training.
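
Each line of `feats.scp` maps an utterance ID to an offset in the corresponding `feats.ark`; a quick way to inspect the result (the byte offsets below are illustrative, not actual values):
```shell
head -n 2 feats/normed_fbank/ljspeech/train/feats.scp
# LJ001-0110 feats/normed_fbank/ljspeech/train/feats.ark:25
# LJ002-0018 feats/normed_fbank/ljspeech/train/feats.ark:66330
```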

If you want to train with speaker IDs (as for LJSpeech) instead of pretrained speaker embeddings such as x-vectors, please run:
```shell
make_utt2spk_id.py data/ljspeech/train/utt2spk data/ljspeech/val/utt2spk
# You can pass more utt2spk files on the CLI. This writes an utt2spk_id.json next to each of these files.
```
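
The resulting `utt2spk_id.json` files are the ones referenced by the training configs. Presumably they map each utterance ID to an integer speaker ID (an assumption based on the file name, not verified here), which you can spot-check with:
```shell
python -c "import json; d = json.load(open('data/ljspeech/train/utt2spk_id.json')); print(list(d.items())[:3])"
```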

## Training
Configurations for training are stored as YAML files in `configs/`.
The data manifests and features for the training and validation sets are specified in those YAML files.
You will need to change the double-quoted file paths there if you train on your own data (one way is sketched below).
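
For example, a copied config can be retargeted in one pass (`mydata` is a placeholder; `sed` is just one option, and editing the paths by hand works equally well):
```shell
cp configs/lj_16k_gt_dur.yaml configs/mydata.yaml
# Rewrite every quoted path that mentions ljspeech to point at your own data.
sed -i 's#ljspeech#mydata#g' configs/mydata.yaml
```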

Then, training is performed by
```shell
python train.py -c configs/${your_yaml} -m ${model_name}
# e.g. python train.py -c configs/lj_16k_gt_dur.yaml -m lj_16k_gt_dur
```
It will create `logs/${model_name}` for logging and checkpointing.
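
Assuming the trainer writes TensorBoard summaries under `logs/` (an assumption based on the directory layout; check `train.py` to confirm), training can be monitored with:
```shell
tensorboard --logdir logs/lj_16k_gt_dur
```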

Several notes:
* By default, the program applies EMA (exponential moving average) to the model weights. Both the EMA and non-EMA weights are saved.
* By default, the program tries to find the latest checkpoint for resuming. EMA checkpoints take precedence over non-EMA checkpoints.
* You can set `use_gt_dur` to `false` to turn on the MAS (monotonic alignment search) algorithm. In this setting, it is better to set `add_blank` to `true`.

## Generate Data for Reflow and Perform Reflow
TO BE DONE
## Inference
TO BE DONE

## Acknowledgement
During development, the following repositories were referred to:
91 changes: 91 additions & 0 deletions cmd.sh
@@ -0,0 +1,91 @@
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <nthreads>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl have unified interface, not depending on its backend.
# These options are mapping to specific options for each backend and
# it is configured by "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs failed, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================


# Select the backend used by run.sh from "local", "stdout", "sge", "slurm", or "ssh"
cmd_backend="local"

# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then

# The other usage
export train_cmd="utils/run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="utils/run.pl"
# Used for "*_recog.py"
export decode_cmd="utils/run.pl"

# Local machine, without any job scheduling system; logs go to stdout
elif [ "${cmd_backend}" = stdout ]; then

# The other usage
export train_cmd="utils/stdout.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="utils/stdout.pl"
# Used for "*_recog.py"
export decode_cmd="utils/stdout.pl"

# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.

export train_cmd="utils/queue.pl"
export cuda_cmd="utils/queue.pl"
export decode_cmd="utils/queue.pl"

# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by defualt for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".

export train_cmd="utils/slurm.pl"
export cuda_cmd="utils/slurm.pl"
export decode_cmd="utils/slurm.pl"

elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the hosts on which to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e. you have to set up ssh keys.

export train_cmd="utils/ssh.pl"
export cuda_cmd="utils/ssh.pl"
export decode_cmd="utils/ssh.pl"

else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
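
As a concrete local example of the interface documented at the top of this file (the log path `exp/demo` is illustrative):
```shell
utils/run.pl JOB=1:4 exp/demo/echo.JOB.log echo "this is job JOB"
```
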
55 changes: 55 additions & 0 deletions configs/lj_16k_gt_dur.yaml
@@ -0,0 +1,55 @@
xvector: false # whether to use xvector for speaker modeling.

perform_reflow: false # if true, will need noise_scp be specified

train:
test_size: 4
n_epochs: 10000
batch_size: 24
learning_rate: !!float 5e-5
seed: 37
save_every: 10
use_gt_dur: true # whether to supervise duration modeling

data:
sampling_rate: 16000
n_mel_channels: 80
add_blank: false # whether to add blank tokens between input phones
hop_length: 200 # in sampling points

phn2id: "data/ljspeech/phones.txt"

train:
utts: "data/ljspeech/train/utts.list"
utt2phns: "data/ljspeech/train/text"
utt2phn_duration: "data/ljspeech/train/phn_duration"
feats_scp: "feats/normed_fbank/ljspeech/train/feats.scp"
utt2num_frames: "feats/normed_fbank/ljspeech/train/utt2num_frames"
utt2spk: "data/ljspeech/train/utt2spk_id.json"

val:
utts: "data/ljspeech/val/utts.list"
utt2phns: "data/ljspeech/val/text"
utt2phn_duration: "data/ljspeech/val/phn_duration"
feats_scp: "feats/normed_fbank/ljspeech/val/feats.scp"
utt2num_frames: "feats/normed_fbank/ljspeech/val/utt2num_frames"
utt2spk: "data/ljspeech/val/utt2spk_id.json"

model:
n_vocab: 148
n_spks: 1
spk_emb_dim: 64
n_enc_channels: 192
filter_channels: 768
filter_channels_dp: 256
n_enc_layers: 6
enc_kernel: 3
enc_dropout: 0.1
n_heads: 2
window_size: 4
dec_dim: 128
pe_scale: 1000
fm_type: "CFM" # FM, CFM
fm_net_type: "unet" # unet or diffsinger
shift_by_mu: false # whether to shift the prior distribution by mu. True means GradTTS-style.
condition_by_mu: true # whether to condition the flow matching decoder on mu. Setting this to false supports text-agnostic voice conversion, as in GlowTTS.
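
A quick way to confirm this config parses and to see how the manifests above are referenced (PyYAML is assumed to be available, since the training code reads these YAML files):
```shell
python -c "
import yaml
cfg = yaml.safe_load(open('configs/lj_16k_gt_dur.yaml'))
print(cfg['data']['train']['feats_scp'])
print(cfg['model']['fm_type'], cfg['model']['fm_net_type'])
"
```
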
10 changes: 10 additions & 0 deletions data/ljspeech/example/phn_duration
@@ -0,0 +1,10 @@
LJ001-0110 9 4 3 4 3 4 9 9 7 5 8 4 8 13 10 14 8 3 4 4 7 5 11 6 9 9 11 15 9 5 7 5 5 3 11 3 4 4 6 5 5 4 7 5 4 4 4 4 8 4 5 4 6 6 9 11
LJ002-0018 3 3 3 3 7 3 3 5 3 3 9 4 4 3 3 4 10 7 8 4 3 5 5 6 4 7 4 6 3 6 4 3 3 4 6 4 3 4 3 3 5 6 11 10 11 11 9 10 9 6 4 3 4 9 10 5 5 3 4 4 3 5 8 4 4 5 9 4 7 5 3 3 3 5 6 4 5 4 4 4 7 7 4 4 4 7 3
LJ002-0043 4 15 7 6 5 7 10 5 13 9 24 3 12 7 5 9 6 4 6 8 6 4 7 4 13 13 11 17 7 5 10 4 7 6 4 3 4 6 7 7 5 8 14 13 9 7 3 4 3 7 13 13 14 17 9 15 13
LJ003-0111 4 3 4 4 4 5 5 6 4 5 4 5 4 3 6 5 9 7 6 8 4 10 5 4 3 3 4 4 5 6 5 5 6 4 3 4 3 4 8 5 8 6 8 7 4 3 5 9 24 19 12 4 4 6 5 10 12 27 5 9 5 8 12 3 3 4 3 6 8 3 3 6 3 4 3 3 3 7 6 4 5 5 3 4 7 6 7 4 6 16
LJ003-0345 10 12 4 3 4 4 4 3 4 4 6 3 5 3 6 12 3 3 3 3 8 4 5 5 6 10 6 8 17 8 3 4 4 4 7 5 6 3 4 3 6 4 4 5 6 4 4 3 4 4 5 5 8 8 4 3 10 5 10 21
LJ004-0045 3 3 3 3 4 12 5 8 5 5 7 10 9 9 11 24 15 5 11 8 5 6 6 6 5 3 3 8 12 25 3 14 6 9 6 5 4 13 4 4 6 7 5 14 21 10 4 5 3 4 3 6 5 6 5 7 5 3 3 5 8 5 7 18
LJ004-0096 15 4 5 8 4 4 3 4 6 5 4 3 5 4 3 3 3 5 4 4 5 3 6 9 4 3 6 3 3 4 6 3 3 5 5 6 4 4 5 3 4 5 4 3 8 3 5 3 4 6 5 4 5 3 3 4 7 7 5 5 4 7 6 5 8 10 6 8 10 6 6 13
LJ004-0152 5 4 6 6 3 3 3 3 4 4 5 5 4 4 4 4 3 3 3 3 4 7 4 8 5 4 8 7 8 6 8 3 3 4 5 4 5 3 5 7 5 4 5 3 3 8 4 5 7 4 11 7 6 5 5 6 6 7 6 6 6 10 5 6 4 4 6 3 3 9 9 7 7 5 3 3 3 7 9 6 3 3 3 3 4 4 5 7 8 8 6 10 22 10 6 6 4 4 3 4 6 6 7 6 5 4 3 4 4 12 8 8 3
LJ005-0014 8 4 7 5 4 6 3 5 4 3 4 5 12 7 4 7 4 5 3 5 4 4 3 4 9 5 8 23 12 9 3 3 5 6 4 7 5 3 3 6 12
LJ005-0079 4 3 3 3 3 6 4 5 4 3 4 4 5 4 4 3 6 4 3 3 4 5 4 4 4 6 9 14 18 6 3 4 7 4 6 3 4 6 6 4 4 3 5 4 4 6 4 7 5 7 5 4 5 5 5 3 4 5 5 4 3 7 3 5 6 3 4 5 5 5 5 4 4 5 3 6 9 4 4 5 8 22
10 changes: 10 additions & 0 deletions data/ljspeech/example/text
@@ -0,0 +1,10 @@
LJ001-0110 IY1 V IH0 N DH AH1 K AE1 S L AH0 N T AY1 P sil W EH1 N IH0 N L AA1 R JH D sil SH OW1 Z G R EY1 T SH AO1 R T K AH2 M IH0 NG Z IH1 N DH IH1 S R IY0 S P EH1 K T
LJ002-0018 DH IY0 IH0 N AE1 D IH0 K W AH0 S IY0 AH0 V DH AH0 JH EY1 L W AH0 Z N OW1 T AH0 S T sil AE1 N D R IH0 P AO1 R T AH0 D AH0 P AA1 N sil AH0 G EH1 N AE1 N D AH0 G EH1 N B AY1 DH AH1 G R AE1 N D JH UH1 R IY0 Z AH0 V DH AH0 S IH1 T IY0 AH0 V L AH1 N D AH0 N sil
LJ002-0043 L AO1 NG N EH1 R OW0 R UW1 M Z sil W AH1 N TH ER1 T IY2 S IH1 K S F IY1 T sil S IH1 K S sil T W EH1 N T IY0 TH R IY1 F IY1 T sil AE1 N D DH IY0 EY1 TH sil EY0 T IY1 N
LJ003-0111 HH IY1 W AH0 Z IH0 N K AA1 N S AH0 K W AH0 N S P UH1 T sil AW1 T AH0 V DH AH1 P ER0 T EH1 K SH AH0 N AH0 V DH EH1 R IH0 N T ER1 N AH0 L L AO1 sil EH1 N D K W OW1 T sil DH EH1 R K OW1 D W AH0 Z AH0 S AH1 B JH IH0 K T AH0 V S AH1 M K Y UH2 R IY0 AA1 S AH0 T IY0
LJ003-0345 sil AO1 L DH AH1 K AH0 M IH1 T IY0 K UH1 D D UW1 IH0 N DH IH0 S R IY0 S P EH1 K T sil W AH0 Z T AH0 TH R OW1 DH AH0 R IY0 S P AA2 N S AH0 B IH1 L AH0 T IY0 AA1 N sil AH1 DH ER0 Z
LJ004-0045 M IH1 S T ER0 S T ER1 JH IH0 Z B AO1 R N sil S ER1 JH EY1 M Z M AE1 K AH0 N T AA2 SH sil S ER1 JH EY1 M Z S K AA1 R L IH0 T sil AE1 N D W IH1 L Y AH0 M sil W IH1 L B ER0 F AO2 R S
LJ004-0096 sil DH AH1 F EY1 T AH0 L K AA1 N S AH0 K W EH2 N S AH0 Z W EH2 R AH1 V M AY1 T B IY0 P R IH0 V EH1 N T IH0 D sil IH0 F DH AH0 JH AH1 S T IH0 S IH0 Z AH0 V DH AH1 P IY1 S W ER0 D UW1 L IY0 AO1 TH ER0 AY2 Z D sil
LJ004-0152 AO2 L DH OW1 AE1 T M IH1 S T ER0 B AH1 K S T AH0 N EH1 S V IH1 Z IH0 T AH0 N UW1 JH EY1 L W AH0 Z IH0 N P R AA1 S EH2 S AH0 V IH0 R EH1 K SH AH0 N sil DH AH0 F ER1 S T S T EH1 P T AO1 R D Z R AH0 F AO1 R M S IH1 N S HH AW1 ER0 D EH1 S V IH2 Z IH0 T EY1 SH AH0 N sil IH1 N S EH1 V AH0 N T IY1 N S EH1 V AH0 N IY0 F AO1 R sil
LJ005-0014 S P IY1 K IH0 NG AA1 N AH0 D AH0 B EY1 T sil AO1 N P R IH1 Z AH0 N M AE1 T ER0 Z sil HH IY1 D IH0 K L EH1 R D DH AE1 T
LJ005-0079 AE1 N D IH0 M P R UW1 V DH AH0 M AO1 R AH0 L Z AH0 V DH AH1 P R IH1 Z N ER0 Z sil AE1 N D SH AE1 L IH0 N SH UH1 R DH AH0 P R AA1 P ER0 M EH1 ZH ER0 AH0 V P AH1 N IH0 SH M AH0 N T sil T AH0 K AH0 N V IH1 K T AH0 D sil AH0 F EH1 N D ER0 Z
10 changes: 10 additions & 0 deletions data/ljspeech/example/utt2spk
@@ -0,0 +1,10 @@
LJ001-0110 LJ
LJ002-0018 LJ
LJ002-0043 LJ
LJ003-0111 LJ
LJ003-0345 LJ
LJ004-0045 LJ
LJ004-0096 LJ
LJ004-0152 LJ
LJ005-0014 LJ
LJ005-0079 LJ
10 changes: 10 additions & 0 deletions data/ljspeech/example/utts.list
@@ -0,0 +1,10 @@
LJ001-0110
LJ002-0018
LJ002-0043
LJ003-0111
LJ003-0345
LJ004-0045
LJ004-0096
LJ004-0152
LJ005-0014
LJ005-0079
10 changes: 10 additions & 0 deletions data/ljspeech/example/wav.scp
@@ -0,0 +1,10 @@
LJ001-0110 /path/to/dataset/LJ001-0110.wav
LJ002-0018 /path/to/dataset/LJ002-0018.wav
LJ002-0043 /path/to/dataset/LJ002-0043.wav
LJ003-0111 /path/to/dataset/LJ003-0111.wav
LJ003-0345 /path/to/dataset/LJ003-0345.wav
LJ004-0045 /path/to/dataset/LJ004-0045.wav
LJ004-0096 /path/to/dataset/LJ004-0096.wav
LJ004-0152 /path/to/dataset/LJ004-0152.wav
LJ005-0014 /path/to/dataset/LJ005-0014.wav
LJ005-0079 /path/to/dataset/LJ005-0079.wav
25 changes: 18 additions & 7 deletions extract_fbank.sh
@@ -3,16 +3,16 @@

nj=16 # number of parallel jobs in feature extraction
sampling_rate=16000 # sampling frequency
fmax= # maximum frequency
fmin= # minimum frequency
fmax= # maximum frequency. If left blank, defaults to half the sampling rate.
fmin= # minimum frequency. If left blank, defaults to 0.
num_mels=80 # number of mel basis
fft_size=1024 # number of fft points
hop_size=256 # number of shift points
win_length= # window length
win_length= # window length. If left blank, defaults to the smallest power of 2 that is greater than hop_size.

train_set="train" # name of training data directory
dev_set="val" # name of development data directory
eval_set="val" # name of evaluation data directory
train_set="ljspeech/train" # name of training data directory
dev_set="ljspeech/val" # name of development data directory
eval_set="ljspeech/val" # name of evaluation data directory

stage=0
stop_stage=100
@@ -45,7 +45,9 @@ fi
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
echo "Cepstral Mean Variance Normalization"
feat_name=fbank
compute-cmvn-stats.py scp:${featdir}/${feat_name}/${train_set}/feats.scp ${featdir}/${feat_name}/${train_set}/cmvn.ark

# If you want to compute the CMVN stats yourself instead of using the provided ones, uncomment the line below.
# compute-cmvn-stats.py scp:${featdir}/${feat_name}/${train_set}/feats.scp ${featdir}/${feat_name}/${train_set}/cmvn.ark
for x in ${train_set} ${dev_set} ${eval_set} ; do
echo "Applying normalization for dataset ${x}"
mkdir -p ${featdir}/normed_${feat_name}/${x} ;
@@ -55,3 +55,12 @@ if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
ark,scp:${featdir}/normed_${feat_name}/${x}/feats.ark,${featdir}/normed_${feat_name}/${x}/feats.scp
done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
echo "Write utt2num_frames"
feat_name=fbank
for x in ${train_set} ${dev_set} ${eval_set} ; do
feat-to-len.py scp:${featdir}/normed_${feat_name}/${x}/feats.scp > ${featdir}/normed_${feat_name}/${x}/utt2num_frames
done
fi
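
The resulting `utt2num_frames` pairs each utterance ID with its mel-spectrogram frame count, one per line (the counts below are illustrative):
```shell
head -n 2 feats/normed_fbank/ljspeech/train/utt2num_frames
# LJ001-0110 413
# LJ002-0018 676
```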

Binary file added feats/fbank/ljspeech/train/cmvn.ark
14 changes: 7 additions & 7 deletions generate_for_reflow.py
@@ -9,18 +9,18 @@
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import utils
import tools


def run(rank, n_gpus, hps, args, ckpt, feats_dir, temp_dir):
logger = utils.get_logger(hps.model_dir, f"inference.{rank}.log") # NOTE: cannot delete this line.
logger = tools.get_logger(hps.model_dir, f"inference.{rank}.log") # NOTE: cannot delete this line.
device = torch.device('cpu' if not torch.cuda.is_available() else f"cuda:{rank}")
torch.manual_seed(hps.train.seed) # NOTE: control seed

setattr(hps.data, "train_utts" if args.dataset == "train" else "val_utts", f"{temp_dir}/{rank}.txt")

train_dataset, collate_fn, model = utils.get_correct_class(hps)
val_dataset, _, _ = utils.get_correct_class(hps, train=False)
train_dataset, collate_fn, model = tools.get_correct_class(hps)
val_dataset, _, _ = tools.get_correct_class(hps, train=False)

batch_collate = collate_fn
train_loader = DataLoader(
Expand All @@ -40,7 +40,7 @@ def run(rank, n_gpus, hps, args, ckpt, feats_dir, temp_dir):
shuffle=False,
)
model = model(**hps.model).to(device)
utils.load_checkpoint(ckpt, model, None)
tools.load_checkpoint(ckpt, model, None)
print(f"Loaded checkpoint from {ckpt}")
model.to(device).eval()
print(f"Number of parameters: {model.nparams}")
@@ -147,8 +147,8 @@ def run(rank, n_gpus, hps, args, ckpt, feats_dir, temp_dir):
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "80000"

hps, args = utils.get_hparams_decode()
ckpt = utils.latest_checkpoint_path(hps.model_dir, "grad_*.pt" if not args.EMA else "EMA_grad_*.pt")
hps, args = tools.get_hparams_decode()
ckpt = tools.latest_checkpoint_path(hps.model_dir, "grad_*.pt" if not args.EMA else "EMA_grad_*.pt")

if args.use_control_spk:
feats_dir = f"synthetic_wav/{args.model}/tts_other_spk"
