forked from X-LANCE/VoiceFlow-TTS
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
73537de
commit 312a89a
Showing
69 changed files
with
29,735 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
conda activate py39 | ||
export PATH=$PWD/tools:$PATH |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
from distutils.util import strtobool | ||
import logging | ||
|
||
import kaldiio | ||
import numpy | ||
|
||
from espnet_transform.cmvn import CMVN | ||
from espnet_utils.cli_readers import file_reader_helper | ||
from espnet_utils.cli_utils import get_commandline_args | ||
from espnet_utils.cli_utils import is_scipy_wav_style | ||
from espnet_utils.cli_writers import file_writer_helper | ||
|
||
|
||
def get_parser():
    """Build the command-line parser for the CMVN application script.

    Returns:
        argparse.ArgumentParser: parser exposing the verbosity, file-type,
        normalization, speaker-map and compression options, followed by
        three positionals: the stats source, the input rspecifier and the
        output wspecifier.
    """
    parser = argparse.ArgumentParser(
        description='apply mean-variance normalization to files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--stats-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')

    # The help text used to be a copy of the --norm-vars one.
    parser.add_argument('--norm-means', type=strtobool, default=True,
                        help='Do mean normalization or not.')
    parser.add_argument('--norm-vars', type=strtobool, default=False,
                        help='Do variance normalization or not.')
    parser.add_argument('--reverse', type=strtobool, default=False,
                        help='Do reverse mode or not')
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--utt2spk', type=str,
                        help='A text file of utterance to speaker map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method(if mat) or '
                             'gzip-level(if hdf5)')
    parser.add_argument('stats_rspecifier_or_rxfilename',
                        help='Input stats. e.g. ark:stats.ark or stats.mat')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier id. e.g. ark:some.ark')
    parser.add_argument('wspecifier', type=str,
                        help='Write specifier id. e.g. ark:some.ark')
    return parser
|
||
|
||
def main():
    """Apply CMVN to every entry of ``rspecifier`` and write the results.

    The stats argument is treated as an rspecifier when it contains a
    ``':'``; in that case the statistics are loaded as a key -> stats
    mapping and each utterance id is passed to the ``CMVN`` object.
    Otherwise a single stats array is loaded from the file and stored
    under the key ``None``, and ``None`` is passed instead of the id.
    """
    args = get_parser().parse_args()

    # logging info
    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(level=log_level, format=logfmt)
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    stats_is_rspecifier = ':' in stats_source

    if stats_is_rspecifier:
        # 'npy' is read through the 'hdf5' reader in rspecifier mode.
        if args.stats_filetype == 'npy':
            stats_filetype = 'hdf5'
        else:
            stats_filetype = args.stats_filetype
        stats_dict = dict(file_reader_helper(stats_source, stats_filetype))
    elif args.stats_filetype == 'mat':
        stats_dict = {None: kaldiio.load_mat(stats_source)}
    else:
        stats_dict = {None: numpy.load(stats_source)}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method)

    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier,
                                           args.in_filetype):
            if is_scipy_wav_style(mat):
                # If data is sound file, then got as Tuple[int, ndarray];
                # the sampling rate is not needed here.
                _, mat = mat
            stats_key = utt if stats_is_rspecifier else None
            writer[utt] = cmvn(mat, stats_key)
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,169 @@ | ||
#!/usr/bin/env python3 | ||
import argparse | ||
import logging | ||
|
||
import kaldiio | ||
import numpy as np | ||
from tqdm import tqdm | ||
from espnet_transform.transformation import Transformation | ||
from espnet_utils.cli_readers import file_reader_helper | ||
from espnet_utils.cli_utils import get_commandline_args | ||
from espnet_utils.cli_utils import is_scipy_wav_style | ||
from espnet_utils.cli_writers import file_writer_helper | ||
|
||
|
||
def get_parser():
    """Build the command-line parser for the CMVN statistics script.

    Returns:
        argparse.ArgumentParser: parser exposing ``--spk2utt``,
        ``--verbose``, the input/output file types and
        ``--preprocess-conf``, followed by the positionals ``rspecifier``
        and ``wspecifier_or_wxfilename``.
    """
    parser = argparse.ArgumentParser(
        # Adjacent literals are concatenated verbatim, so each fragment
        # must carry its own separating space.
        description='Compute cepstral mean and '
                    'variance normalization statistics. '
                    'If wspecifier provided: per-utterance by default, '
                    'or per-speaker if '
                    'spk2utt option provided; if wxfilename: global',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier for feats. e.g. ark:some.ark')
    parser.add_argument('wspecifier_or_wxfilename', type=str,
                        help='Write specifier. e.g. ark:some.ark')
    return parser
|
||
|
||
def main():
    """Accumulate CMVN statistics over the input features and write them.

    The mode is chosen from ``wspecifier_or_wxfilename``: when it contains
    a ``':'`` the statistics are keyed per utterance (or per speaker when
    ``--spk2utt`` is given) and written through ``file_writer_helper``;
    otherwise everything is accumulated under the single key ``None`` and
    saved with ``np.save`` or ``kaldiio.save_mat``.

    For each key the stored array has two rows: row 0 holds the feature
    sums with the frame count in the last slot, row 1 holds the sums of
    squares with ``0.`` in the last slot.

    Raises:
        RuntimeError: if no utterance was read from ``rspecifier``, or if
            the output file type is not supported in global mode.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            utt2spk_dict = {}
            with open(args.spk2utt) as f:
                for line in f:
                    # Each line: "<speaker> <utt1> <utt2> ..."
                    spk, utts = line.rstrip().split(None, 1)
                    for utt in utts.split():
                        utt2spk_dict[utt] = spk

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            # Every utterance is accumulated under the same key.
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in tqdm(enumerate(file_reader_helper(
            args.rspecifier, args.in_filetype), 1), desc="Computing Stats"):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            rate, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))
    # Explicit check instead of ``assert`` so the guard is kept under
    # ``python -O`` and the failure explains itself.
    if idx <= 0:
        raise RuntimeError(
            'No utterances were read from {}'.format(args.rspecifier))

    cmvn_stats = {}
    for spk in counts:
        feat_shape = sum_feats[spk].shape
        # One extra slot along the first feature axis for the frame count.
        cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
        _cmvn_stats = np.empty(cmvn_shape, dtype=np.float64)
        _cmvn_stats[0, :-1] = sum_feats[spk]
        _cmvn_stats[1, :-1] = square_sum_feats[spk]

        _cmvn_stats[0, -1] = counts[spk]
        _cmvn_stats[1, -1] = 0.

        # You can get the mean and std as following,
        # >>> N = _cmvn_stats[0, -1]
        # >>> mean = _cmvn_stats[0, :-1] / N
        # >>> std = np.sqrt(_cmvn_stats[1, :-1] / N - mean ** 2)

        cmvn_stats[spk] = _cmvn_stats

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(args.wspecifier_or_wxfilename,
                                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'
                               .format(args.out_filetype))
|
||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,105 @@ | ||
#!/usr/bin/env python3 | ||
|
||
# Copyright 2018 Nagoya University (Tomoki Hayashi) | ||
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) | ||
|
||
import argparse | ||
from distutils.util import strtobool | ||
import logging | ||
|
||
import kaldiio | ||
import numpy | ||
|
||
from espnet_transform.spectrogram import logmelspectrogram | ||
from espnet_utils.cli_utils import get_commandline_args | ||
from espnet_utils.cli_writers import file_writer_helper | ||
|
||
|
||
def get_parser():
    """Build the command-line parser for the FBANK extraction script.

    Returns:
        argparse.ArgumentParser: parser exposing the spectrogram settings
        (``--fs``, ``--fmin``/``--fmax``, ``--n_mels``, ``--n_fft``,
        ``--n_shift``, ``--win_length``, ``--window``), the output and
        normalization options, ``--segments``, and the positionals
        ``rspecifier`` and ``wspecifier``.
    """
    parser = argparse.ArgumentParser(
        description='compute FBANK feature from WAV',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int,
                        help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method(if mat) or '
                             'gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    parser.add_argument(
        '--segments', type=str,
        # Adjacent literals are concatenated verbatim, so each fragment
        # must end with its own separating space.
        help='segments-file format: each line is either '
             '<segment-id> <recording-id> <start-time> <end-time> '
             'e.g. call-861225-A-0050-0065 call-861225-A 5.0 6.5')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    return parser
|
||
|
||
def main():
    """Compute a log-mel spectrogram for every WAV entry and write it out.

    Entries are read as ``(rate, array)`` pairs through
    ``kaldiio.ReadHelper`` (optionally cut by ``--segments``), cast to
    float32, optionally scaled by the PCM bit depth given with
    ``--normalize``, passed to ``logmelspectrogram`` and stored under the
    same utterance id via ``file_writer_helper``.

    Raises:
        ValueError: if the sampling rate of an entry differs from ``--fs``.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            file_writer_helper(args.wspecifier,
                               filetype=args.filetype,
                               write_num_frames=args.write_num_frames,
                               compress=args.compress,
                               compression_method=args.compression_method
                               ) as writer:
        for utt_id, (rate, array) in reader:
            # Explicit check instead of ``assert`` so the guard is kept
            # under ``python -O`` and the failure names the utterance.
            if rate != args.fs:
                raise ValueError(
                    'Sampling rate mismatch for {}: got {}, '
                    'but --fs is {}'.format(utt_id, rate, args.fs))
            array = array.astype(numpy.float32)
            if args.normalize is not None and args.normalize != 1:
                # Divide by 2 ** (bit depth - 1).
                array = array / (1 << (args.normalize - 1))

            lmspc = logmelspectrogram(
                x=array,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax)
            writer[utt_id] = lmspc
|
||
|
||
if __name__ == "__main__": | ||
main() |
Oops, something went wrong.