Skip to content

Commit

Permalink
add tools
Browse files Browse the repository at this point in the history
  • Loading branch information
cantabile-kwok committed Oct 7, 2023
1 parent 73537de commit 312a89a
Show file tree
Hide file tree
Showing 69 changed files with 29,735 additions and 0 deletions.
1 change: 1 addition & 0 deletions path.sh
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
conda activate py39
export PATH=$PWD/tools:$PATH
116 changes: 116 additions & 0 deletions tools/apply-cmvn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy

from espnet_transform.cmvn import CMVN
from espnet_utils.cli_readers import file_reader_helper
from espnet_utils.cli_utils import get_commandline_args
from espnet_utils.cli_utils import is_scipy_wav_style
from espnet_utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser for the CMVN application tool.

    Returns:
        argparse.ArgumentParser: parser exposing the normalization options
        and three positional arguments (stats source, input rspecifier,
        output wspecifier).
    """
    parser = argparse.ArgumentParser(
        description='apply mean-variance normalization to files',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    # Worded differently from --in-filetype so the two options can be told
    # apart in the --help output.
    parser.add_argument('--stats-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the stats. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')

    # NOTE: this help text used to be a copy of the --norm-vars one.
    parser.add_argument('--norm-means', type=strtobool, default=True,
                        help='Do mean normalization or not.')
    parser.add_argument('--norm-vars', type=strtobool, default=False,
                        help='Do variance normalization or not.')
    parser.add_argument('--reverse', type=strtobool, default=False,
                        help='Do reverse mode or not')
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--utt2spk', type=str,
                        help='A text file of utterance to speaker map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:utt2spk")')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method(if mat) or '
                             'gzip-level(if hdf5)')
    parser.add_argument('stats_rspecifier_or_rxfilename',
                        help='Input stats. e.g. ark:stats.ark or stats.mat')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier id. e.g. ark:some.ark')
    parser.add_argument('wspecifier', type=str,
                        help='Write specifier id. e.g. ark:some.ark')
    return parser


def main():
    """Apply CMVN to every item of the input rspecifier and write the result.

    The stats argument is treated as an rspecifier (a table of stats looked
    up by utterance id) when it contains ':'; otherwise it is loaded as a
    single stats matrix that is used for every utterance.
    """
    args = get_parser().parse_args()

    # Only show INFO messages when the user asked for verbosity.
    log_level = logging.INFO if args.verbose > 0 else logging.WARN
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s")
    logging.info(get_commandline_args())

    stats_source = args.stats_rspecifier_or_rxfilename
    stats_is_table = ':' in stats_source

    if stats_is_table:
        # 'npy' is not read through the table reader; hdf5 is used instead.
        table_filetype = args.stats_filetype
        if table_filetype == 'npy':
            table_filetype = 'hdf5'
        stats_dict = dict(file_reader_helper(stats_source, table_filetype))
    elif args.stats_filetype == 'mat':
        stats_dict = {None: kaldiio.load_mat(stats_source)}
    else:
        stats_dict = {None: numpy.load(stats_source)}

    cmvn = CMVN(stats=stats_dict,
                norm_means=args.norm_means,
                norm_vars=args.norm_vars,
                utt2spk=args.utt2spk,
                spk2utt=args.spk2utt,
                reverse=args.reverse)

    writer_options = dict(
        filetype=args.out_filetype,
        write_num_frames=args.write_num_frames,
        compress=args.compress,
        compression_method=args.compression_method)

    with file_writer_helper(args.wspecifier, **writer_options) as writer:
        for utt, mat in file_reader_helper(args.rspecifier, args.in_filetype):
            if is_scipy_wav_style(mat):
                # Sound input arrives as a (rate, ndarray) pair; keep the data.
                _, mat = mat
            # The single stats matrix is stored under the key None.
            stats_key = utt if stats_is_table else None
            writer[utt] = cmvn(mat, stats_key)


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
169 changes: 169 additions & 0 deletions tools/compute-cmvn-stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
#!/usr/bin/env python3
import argparse
import logging

import kaldiio
import numpy as np
from tqdm import tqdm
from espnet_transform.transformation import Transformation
from espnet_utils.cli_readers import file_reader_helper
from espnet_utils.cli_utils import get_commandline_args
from espnet_utils.cli_utils import is_scipy_wav_style
from espnet_utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser for the CMVN statistics tool.

    Returns:
        argparse.ArgumentParser: parser exposing the file-type options, the
        optional spk2utt map and preprocessing config, and two positional
        arguments (input rspecifier, output wspecifier or filename).
    """
    # The description is assembled from adjacent string literals, so every
    # fragment must carry its own trailing separator.
    parser = argparse.ArgumentParser(
        description='Compute cepstral mean and '
                    'variance normalization statistics. '
                    'If wspecifier provided: per-utterance by default, '
                    'or per-speaker if '
                    'spk2utt option provided; if wxfilename: global',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--spk2utt', type=str,
                        help='A text file of speaker to utterance-list map. '
                             '(Don\'t give rspecifier format, such as '
                             '"ark:spk2utt")')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--in-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'sound.hdf5', 'sound'],
                        help='Specify the file format for the rspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--out-filetype', type=str, default='mat',
                        choices=['mat', 'hdf5', 'npy'],
                        help='Specify the file format for the wspecifier. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--preprocess-conf', type=str, default=None,
                        help='The configuration file for the pre-processing')
    parser.add_argument('rspecifier', type=str,
                        help='Read specifier for feats. e.g. ark:some.ark')
    parser.add_argument('wspecifier_or_wxfilename', type=str,
                        help='Write specifier. e.g. ark:some.ark')
    return parser


def _read_spk2utt_as_utt2spk(path):
    """Read a spk2utt text file and return the inverted utt -> spk dict.

    Each non-empty line is "<speaker> <utt1> <utt2> ...". Blank lines and
    speakers listed without any utterance contribute nothing instead of
    aborting the run.
    """
    utt2spk_dict = {}
    with open(path) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            spk = fields[0]
            for utt in fields[1:]:
                utt2spk_dict[utt] = spk
    return utt2spk_dict


def _pack_cmvn_stats(count, feat_sum, feat_square_sum):
    """Pack a frame count and the two accumulators into one stats array.

    Row 0 holds the feature sums followed by the frame count; row 1 holds
    the squared-feature sums followed by 0. The mean and std can be
    recovered as:

    >>> N = stats[0, -1]
    >>> mean = stats[0, :-1] / N
    >>> std = np.sqrt(stats[1, :-1] / N - mean ** 2)
    """
    feat_shape = feat_sum.shape
    cmvn_shape = (2, feat_shape[0] + 1) + feat_shape[1:]
    stats = np.empty(cmvn_shape, dtype=np.float64)
    stats[0, :-1] = feat_sum
    stats[1, :-1] = feat_square_sum
    stats[0, -1] = count
    stats[1, -1] = 0.
    return stats


def main():
    """Accumulate CMVN statistics from the input features and write them.

    When the output argument contains ':' it is treated as a wspecifier and
    one stats matrix is written per utterance, or per speaker if --spk2utt
    is given. Otherwise a single global stats matrix is written to the
    given filename.

    Raises:
        RuntimeError: if the input rspecifier yields no utterances, or if
            the output file type is unsupported in global mode.
    """
    args = get_parser().parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    is_wspecifier = ':' in args.wspecifier_or_wxfilename

    # utt2spk maps an utterance id to the key its stats are accumulated under.
    if is_wspecifier:
        if args.spk2utt is not None:
            logging.info('Performing as speaker CMVN mode')
            utt2spk_dict = _read_spk2utt_as_utt2spk(args.spk2utt)

            def utt2spk(x):
                return utt2spk_dict[x]
        else:
            logging.info('Performing as utterance CMVN mode')

            def utt2spk(x):
                return x

        if args.out_filetype == 'npy':
            logging.warning('--out-filetype npy is allowed only for '
                            'Global CMVN mode, changing to hdf5')
            args.out_filetype = 'hdf5'

    else:
        logging.info('Performing as global CMVN mode')
        if args.spk2utt is not None:
            logging.warning('spk2utt is not used for global CMVN mode')

        def utt2spk(x):
            # Everything is pooled under the single key None.
            return None

        if args.out_filetype == 'hdf5':
            logging.warning('--out-filetype hdf5 is not allowed for '
                            'Global CMVN mode, changing to npy')
            args.out_filetype = 'npy'

    if args.preprocess_conf is not None:
        preprocessing = Transformation(args.preprocess_conf)
        logging.info('Apply preprocessing: {}'.format(preprocessing))
    else:
        preprocessing = None

    # Calculate stats for each speaker
    counts = {}
    sum_feats = {}
    square_sum_feats = {}

    idx = 0
    for idx, (utt, matrix) in tqdm(enumerate(file_reader_helper(
            args.rspecifier, args.in_filetype), 1), desc="Computing Stats"):
        if is_scipy_wav_style(matrix):
            # If data is sound file, then got as Tuple[int, ndarray]
            _, matrix = matrix
        if preprocessing is not None:
            matrix = preprocessing(matrix, uttid_list=utt)

        spk = utt2spk(utt)

        # Init at the first seen of the spk
        if spk not in counts:
            counts[spk] = 0
            feat_shape = matrix.shape[1:]
            # Accumulate in double precision
            sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)
            square_sum_feats[spk] = np.zeros(feat_shape, dtype=np.float64)

        counts[spk] += matrix.shape[0]
        sum_feats[spk] += matrix.sum(axis=0)
        square_sum_feats[spk] += (matrix ** 2).sum(axis=0)
    logging.info('Processed {} utterances'.format(idx))

    # Explicit check instead of `assert`: asserts are stripped under -O, and
    # global mode would then fail later with a bare KeyError on None.
    if idx == 0:
        raise RuntimeError(
            'No utterances were read from {}'.format(args.rspecifier))

    cmvn_stats = {
        spk: _pack_cmvn_stats(counts[spk], sum_feats[spk],
                              square_sum_feats[spk])
        for spk in counts
    }

    # Per utterance or speaker CMVN
    if is_wspecifier:
        with file_writer_helper(args.wspecifier_or_wxfilename,
                                filetype=args.out_filetype) as writer:
            for spk, mat in cmvn_stats.items():
                writer[spk] = mat

    # Global CMVN
    else:
        matrix = cmvn_stats[None]
        if args.out_filetype == 'npy':
            np.save(args.wspecifier_or_wxfilename, matrix)
        elif args.out_filetype == 'mat':
            # Kaldi supports only matrix or vector
            kaldiio.save_mat(args.wspecifier_or_wxfilename, matrix)
        else:
            raise RuntimeError('Not supporting: --out-filetype {}'
                               .format(args.out_filetype))


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
105 changes: 105 additions & 0 deletions tools/compute-fbank-feats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
#!/usr/bin/env python3

# Copyright 2018 Nagoya University (Tomoki Hayashi)
# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)

import argparse
from distutils.util import strtobool
import logging

import kaldiio
import numpy

from espnet_transform.spectrogram import logmelspectrogram
from espnet_utils.cli_utils import get_commandline_args
from espnet_utils.cli_writers import file_writer_helper


def get_parser():
    """Build the command-line parser for the FBANK feature extraction tool.

    Returns:
        argparse.ArgumentParser: parser exposing the analysis options
        (sampling rate, mel/FFT settings, window), the output options, and
        two positional arguments (WAV rspecifier, output wspecifier).
    """
    parser = argparse.ArgumentParser(
        description='compute FBANK feature from WAV',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fs', type=int,
                        help='Sampling frequency')
    parser.add_argument('--fmax', type=int, default=None, nargs='?',
                        help='Maximum frequency')
    parser.add_argument('--fmin', type=int, default=None, nargs='?',
                        help='Minimum frequency')
    parser.add_argument('--n_mels', type=int, default=80,
                        help='Number of mel basis')
    parser.add_argument('--n_fft', type=int, default=1024,
                        help='FFT length in point')
    parser.add_argument('--n_shift', type=int, default=512,
                        help='Shift length in point')
    parser.add_argument('--win_length', type=int, default=None, nargs='?',
                        help='Analysis window length in point')
    parser.add_argument('--window', type=str, default='hann',
                        choices=['hann', 'hamming'],
                        help='Type of window')
    parser.add_argument('--write-num-frames', type=str,
                        help='Specify wspecifier for utt2num_frames')
    parser.add_argument('--filetype', type=str, default='mat',
                        choices=['mat', 'hdf5'],
                        help='Specify the file format for output. '
                             '"mat" is the matrix format in kaldi')
    parser.add_argument('--compress', type=strtobool, default=False,
                        help='Save in compressed format')
    parser.add_argument('--compression-method', type=int, default=2,
                        help='Specify the method(if mat) or '
                             'gzip-level(if hdf5)')
    parser.add_argument('--verbose', '-V', default=0, type=int,
                        help='Verbose option')
    parser.add_argument('--normalize', choices=[1, 16, 24, 32], type=int,
                        default=None,
                        help='Give the bit depth of the PCM, '
                             'then normalizes data to scale in [-1,1]')
    parser.add_argument('rspecifier', type=str, help='WAV scp file')
    # The help is built from adjacent string literals, so each fragment
    # carries its own separator to keep the rendered text readable.
    parser.add_argument(
        '--segments', type=str,
        help='segments-file format: each line is '
             '"<segment-id> <recording-id> <start-time> <end-time>", '
             'e.g. "call-861225-A-0050-0065 call-861225-A 5.0 6.5"')
    parser.add_argument('wspecifier', type=str, help='Write specifier')
    return parser


def main():
    """Compute log-mel spectrogram features for every WAV in the rspecifier.

    Each waveform is converted to float32, optionally scaled by the PCM bit
    depth given with --normalize, passed to ``logmelspectrogram`` and
    written to the wspecifier under its utterance id.

    Raises:
        ValueError: if the sampling rate of a waveform differs from --fs.
    """
    parser = get_parser()
    args = parser.parse_args()

    logfmt = "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s"
    if args.verbose > 0:
        logging.basicConfig(level=logging.INFO, format=logfmt)
    else:
        logging.basicConfig(level=logging.WARN, format=logfmt)
    logging.info(get_commandline_args())

    # Loop-invariant divisor: 2 ** (bits - 1) maps signed PCM into [-1, 1].
    # A bit depth of 1 (or no --normalize) leaves the samples untouched.
    if args.normalize is not None and args.normalize != 1:
        scale = 1 << (args.normalize - 1)
    else:
        scale = None

    with kaldiio.ReadHelper(args.rspecifier,
                            segments=args.segments) as reader, \
            file_writer_helper(args.wspecifier,
                               filetype=args.filetype,
                               write_num_frames=args.write_num_frames,
                               compress=args.compress,
                               compression_method=args.compression_method
                               ) as writer:
        for utt_id, (rate, array) in reader:
            # Explicit check instead of `assert`: asserts are stripped under
            # -O, which would silently extract features with the wrong rate.
            if rate != args.fs:
                raise ValueError(
                    'Sampling rate mismatch for {}: the file has {} but '
                    '--fs is {}'.format(utt_id, rate, args.fs))
            array = array.astype(numpy.float32)
            if scale is not None:
                array = array / scale

            lmspc = logmelspectrogram(
                x=array,
                fs=args.fs,
                n_mels=args.n_mels,
                n_fft=args.n_fft,
                n_shift=args.n_shift,
                win_length=args.win_length,
                window=args.window,
                fmin=args.fmin,
                fmax=args.fmax)
            writer[utt_id] = lmspc


# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Loading

0 comments on commit 312a89a

Please sign in to comment.