diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..d7f340a7
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,36 @@
+# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
+# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions
+
+name: Python package
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+jobs:
+  build:
+
+    runs-on: ubuntu-latest
+    env:
+      OMP_NUM_THREADS: 1
+    strategy:
+      matrix:
+        python-version: [3.7, 3.8, 3.9]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v1
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        sudo apt-get install libsndfile1-dev
+        python -m pip install --upgrade pip
+        pip install torch
+        pip install -e ".[test]"
+    - name: Test
+      run: |
+        nosetests --with-coverage --cover-package=nnmnkwii -v -w tests/ -a '!require_local_data,!modspec'
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index f989c1a2..824295bf 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@ README.rst
 nnmnkwii/version.py
 nnmnkwii/paramgen/mlpg_helper.c
 nnmnkwii/util/_linalg.c
+nnmnkwii/paramgen/_bandmat/*.c
 examples
 docs/references/generated
 docs/.doctrees
diff --git a/.travis.yml b/.travis.yml
index 2ce221ad..b05ae2b4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,14 +35,8 @@ install:
   - pip install -e ".[test]"
 
 script:
-  # Workaround for bandmat installation issue on python >= 3.7
-  # see https://github.com/MattShannon/bandmat/issues/10
-  - if [ "$TRAVIS_PYTHON_VERSION" == "3.7" -o "$TRAVIS_PYTHON_VERSION" == "3.8" ]; then
-      nosetests --with-coverage --cover-package=nnmnkwii -v -w tests/ -a '!require_local_data,!modspec,!requires_bandmat' --ignore-files=test_autograd.py;
-    else
-      nosetests --with-coverage --cover-package=nnmnkwii -v -w tests/ -a '!require_local_data,!modspec';
-    fi
-  - flake8
+  - nosetests --with-coverage --cover-package=nnmnkwii -v -w tests/ -a '!require_local_data,!modspec';
+#  - flake8
 
 after_success:
   - codecov
diff --git a/MANIFEST.in b/MANIFEST.in
index 62564d9b..30745a6f 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,4 +1,4 @@
 include README.md LICENSE.md
-recursive-include nnmnkwii COPYING
+recursive-include nnmnkwii COPYING License
 recursive-include nnmnkwii/util/_example_data *.lab *.wav *.npz *.hed
 recursive-include nnmnkwii *.c
diff --git a/README.md b/README.md
index 0f04cdb3..a925390b 100644
--- a/README.md
+++ b/README.md
@@ -4,6 +4,7 @@
 [![][docs-stable-img]][docs-stable-url]
 [![][docs-latest-img]][docs-latest-url]
 [![PyPI](https://img.shields.io/pypi/v/nnmnkwii.svg)](https://pypi.python.org/pypi/nnmnkwii)
+[![Python package](https://github.com/r9y9/nnmnkwii/actions/workflows/ci.yaml/badge.svg)](https://github.com/r9y9/nnmnkwii/actions/workflows/ci.yaml)
 [![Build Status](https://travis-ci.org/r9y9/nnmnkwii.svg?branch=master)](https://travis-ci.org/r9y9/nnmnkwii)
 [![Build status](https://ci.appveyor.com/api/projects/status/ch8cmtpw8ic1sd86?svg=true)](https://ci.appveyor.com/project/r9y9/nnmnkwii)
 [![codecov](https://codecov.io/gh/r9y9/nnmnkwii/branch/master/graph/badge.svg)](https://codecov.io/gh/r9y9/nnmnkwii)
@@ -11,8 +12,6 @@
 
 Library to build speech synthesis systems designed for easy and fast prototyping.
 
-Supported python versions: 2.7 and 3.6.
-
 ## Documentation
 
 - [**STABLE**][docs-stable-url] &mdash; **most recently tagged version of the documentation.**
diff --git a/docs/changelog.rst b/docs/changelog.rst
index 96a74b82..6d6db4a8 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -1,9 +1,18 @@
 Change log
 ==========
 
-v0.0.23 <2021-xx-xx>
+v0.0.23 <2021-05-15>
 --------------------
 
+- `#112`_: Renamed continuous_dict to numeric_dict to be more precise.
+- `#112`_: Bandmat is now a part of nnmnkwii's internal package. This is to avoid instllation failusres on python >=3.6. https://github.com/MattShannon/bandmat/issues/10
+- `#112`_: Started testing using github actions (python 3.7, 3.8, 3.9)
+- `#112`_: [hts.io]: :func:`nnmnkwii.io.hts.load_question_set` now keeps question names (e.g. "L-Phone_Boin") in dictionary.
+- `#112`_: [hts.io]: New functionality :func:`nnmnkwii.io.hts.write_audacity_labels`
+- `#112`_: [hts.io]: New functionality :func:`nnmnkwii.io.hts.write_textgrid`
+- `#112`_: Renamed np.int to int
+- `#112`_: Added pyproject.yaml
+
 v0.0.22 <2020-12-25>
 --------------------
 
@@ -214,4 +223,5 @@ v0.0.1 <2017-08-14>
 .. _#101: https://github.com/r9y9/nnmnkwii/pull/101
 .. _#105: https://github.com/r9y9/nnmnkwii/pull/105
 .. _#108: https://github.com/r9y9/nnmnkwii/pull/108
-.. _#109: https://github.com/r9y9/nnmnkwii/pull/109
\ No newline at end of file
+.. _#109: https://github.com/r9y9/nnmnkwii/pull/109
+.. _#112: https://github.com/r9y9/nnmnkwii/pull/112
diff --git a/docs/conf.py b/docs/conf.py
index db6c9ad9..2c032e41 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -22,12 +22,13 @@
 # sys.path.insert(0, os.path.abspath('.'))
 
 
-import pkg_resources
 import os
 
-__version__ = pkg_resources.get_distribution('nnmnkwii').version
+import pkg_resources
+
+__version__ = pkg_resources.get_distribution("nnmnkwii").version
 
-ON_RTD = os.environ.get('READTHEDOCS', None) == 'True'
+ON_RTD = os.environ.get("READTHEDOCS", None) == "True"
 
 # -- General configuration ------------------------------------------------
 
@@ -39,23 +40,23 @@
 # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
 # ones.
 extensions = [
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx.ext.doctest',
-    'sphinx.ext.intersphinx',
-    'sphinx.ext.mathjax',
-    'sphinx.ext.todo',
-    'sphinx.ext.viewcode',
-    'sphinx.ext.githubpages',
-    'sphinx.ext.napoleon',
-    'numpydoc',
-    'nbsphinx',
-    'IPython.sphinxext.ipython_console_highlighting',
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx.ext.doctest",
+    "sphinx.ext.intersphinx",
+    "sphinx.ext.mathjax",
+    "sphinx.ext.todo",
+    "sphinx.ext.viewcode",
+    "sphinx.ext.githubpages",
+    "sphinx.ext.napoleon",
+    "numpydoc",
+    "nbsphinx",
+    "IPython.sphinxext.ipython_console_highlighting",
 ]
 
 if ON_RTD:
     # Remove extensions not currently supported on RTD
-    extensions.remove('matplotlib.sphinxext.plot_directive')
+    extensions.remove("matplotlib.sphinxext.plot_directive")
 
 autosummary_generate = True
 numpydoc_show_class_members = False
@@ -77,18 +78,18 @@
     else:
         try:
             print("plot_directive.__version__:", plot_directive.__version__)
-            use_matplotlib_plot_directive = (plot_directive.__version__ >= 2)
+            use_matplotlib_plot_directive = plot_directive.__version__ >= 2
         except AttributeError:
             use_matplotlib_plot_directive = False
 
     if use_matplotlib_plot_directive:
-        extensions.append('matplotlib.sphinxext.plot_directive')
+        extensions.append("matplotlib.sphinxext.plot_directive")
     else:
         raise RuntimeError("You need a recent enough version of matplotlib")
 
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # Plot
-#------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 plot_pre_code = """
 import seaborn
 seaborn.set(style='ticks')
@@ -97,46 +98,47 @@
 np.set_printoptions(precision=3, linewidth=64, edgeitems=2, threshold=200)
 """
 plot_include_source = True
-plot_formats = [('png', 96), 'pdf']
+plot_formats = [("png", 96), "pdf"]
 plot_html_show_formats = False
 
 font_size = 13 * 72 / 96.0  # 13 px
 
 plot_rcparams = {
-    'font.size': font_size,
-    'axes.titlesize': font_size,
-    'axes.labelsize': font_size,
-    'xtick.labelsize': font_size,
-    'ytick.labelsize': font_size,
-    'legend.fontsize': font_size,
-    'figure.subplot.bottom': 0.2,
-    'figure.subplot.left': 0.2,
-    'figure.subplot.right': 0.9,
-    'figure.subplot.top': 0.85,
-    'figure.subplot.wspace': 0.4,
-    'text.usetex': False,
+    "font.size": font_size,
+    "axes.titlesize": font_size,
+    "axes.labelsize": font_size,
+    "xtick.labelsize": font_size,
+    "ytick.labelsize": font_size,
+    "legend.fontsize": font_size,
+    "figure.subplot.bottom": 0.2,
+    "figure.subplot.left": 0.2,
+    "figure.subplot.right": 0.9,
+    "figure.subplot.top": 0.85,
+    "figure.subplot.wspace": 0.4,
+    "text.usetex": False,
 }
 
 if not ON_RTD:
     import matplotlib
+
     matplotlib.rcParams.update(plot_rcparams)
 
 # Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]
 
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 #
 # source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = ".rst"
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = "index"
 
 # General information about the project.
-project = 'nnmnkwii'
-copyright = '2017, Ryuichi Yamamoto'
-author = 'Ryuichi Yamamoto'
+project = "nnmnkwii"
+copyright = "2017, Ryuichi Yamamoto"
+author = "Ryuichi Yamamoto"
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -157,10 +159,10 @@
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
 # This patterns also effect to html_static_path and html_extra_path
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '**.ipynb_checkpoints']
+exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "**.ipynb_checkpoints"]
 
 # The name of the Pygments (syntax highlighting) style to use.
-pygments_style = 'sphinx'
+pygments_style = "sphinx"
 
 # If true, `todo` and `todoList` produce output, else they produce nothing.
 todo_include_todos = True
@@ -171,24 +173,27 @@
 # The theme to use for HTML and HTML Help pages.  See the documentation for
 # a list of builtin themes.
 #
-html_theme = 'sphinx_rtd_theme'
+html_theme = "sphinx_rtd_theme"
 
 # Theme options are theme-specific and customize the look and feel of a theme
 # further.  For a list of options available for each theme, see the
 # documentation.
 #
 html_theme_options = {
-    'collapse_navigation': False,
-    'display_version': True,
-    'logo_only': True,
+    "collapse_navigation": False,
+    "display_version": True,
+    "logo_only": True,
 }
 
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = ["_static"]
+html_css_files = [
+    "css/custom.css",
+]
 
-html_logo = os.path.join(html_static_path[0], 'img/logo.png')
+html_logo = os.path.join(html_static_path[0], "img/logo.png")
 
 # Custom sidebar templates, must be a dictionary that maps document names
 # to template names.
@@ -196,12 +201,12 @@
 # This is required for the alabaster theme
 # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars
 html_sidebars = {
-    '**': [
-        'about.html',
-        'navigation.html',
-        'relations.html',  # needs 'show_related': True theme option to display
-        'searchbox.html',
-        'donate.html',
+    "**": [
+        "about.html",
+        "navigation.html",
+        "relations.html",  # needs 'show_related': True theme option to display
+        "searchbox.html",
+        "donate.html",
     ]
 }
 
@@ -209,7 +214,7 @@
 # -- Options for HTMLHelp output ------------------------------------------
 
 # Output file base name for HTML help builder.
-htmlhelp_basename = 'nnmnkwiidoc'
+htmlhelp_basename = "nnmnkwiidoc"
 
 
 # -- Options for LaTeX output ---------------------------------------------
@@ -218,15 +223,12 @@
     # The paper size ('letterpaper' or 'a4paper').
     #
     # 'papersize': 'letterpaper',
-
     # The font size ('10pt', '11pt' or '12pt').
     #
     # 'pointsize': '10pt',
-
     # Additional stuff for the LaTeX preamble.
     #
     # 'preamble': '',
-
     # Latex figure (float) alignment
     #
     # 'figure_align': 'htbp',
@@ -236,8 +238,13 @@
 # (source start file, target name, title,
 #  author, documentclass [howto, manual, or own class]).
 latex_documents = [
-    (master_doc, 'nnmnkwii.tex', 'nnmnkwii Documentation',
-     'Ryuichi Yamamoto', 'manual'),
+    (
+        master_doc,
+        "nnmnkwii.tex",
+        "nnmnkwii Documentation",
+        "Ryuichi Yamamoto",
+        "manual",
+    ),
 ]
 
 
@@ -245,10 +252,7 @@
 
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
-man_pages = [
-    (master_doc, 'nnmnkwii', 'nnmnkwii Documentation',
-     [author], 1)
-]
+man_pages = [(master_doc, "nnmnkwii", "nnmnkwii Documentation", [author], 1)]
 
 
 # -- Options for Texinfo output -------------------------------------------
@@ -257,18 +261,26 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-    (master_doc, 'nnmnkwii', 'nnmnkwii Documentation',
-     author, 'nnmnkwii', 'One line description of project.',
-     'Miscellaneous'),
+    (
+        master_doc,
+        "nnmnkwii",
+        "nnmnkwii Documentation",
+        author,
+        "nnmnkwii",
+        "One line description of project.",
+        "Miscellaneous",
+    ),
 ]
 
 
 # Example configuration for intersphinx: refer to the Python standard library.
 intersphinx_mapping = {
-    'python': ('https://docs.python.org/', None),
-    'numpy': ('http://docs.scipy.org/doc/numpy/', None),
-    'pysptk': ('https://pysptk.readthedocs.io/en/latest/', None),
-    'pytorch': ('http://pytorch.org/docs/master/', None),
-    'librosa': ('http://librosa.github.io/librosa/', None),
-    'sklearn': ('http://scikit-learn.org/stable/', None)
+    "python": ("https://docs.python.org/3", None),
+    "numpy": ("https://numpy.org/doc/stable/", None),
+    "np": ("https://numpy.org/doc/stable/", None),
+    "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None),
+    "matplotlib": ("https://matplotlib.org/", None),
+    "sklearn": ("https://scikit-learn.org/stable/", None),
+    "pysptk": ("https://pysptk.readthedocs.io/en/latest/", None),
+    "pytorch": ("http://pytorch.org/docs/stable/", None),
 }
diff --git a/docs/nnmnkwii_gallery b/docs/nnmnkwii_gallery
index bd46ccfa..ddc6b2d5 160000
--- a/docs/nnmnkwii_gallery
+++ b/docs/nnmnkwii_gallery
@@ -1 +1 @@
-Subproject commit bd46ccfae88e8a5bc65dead67218ce34037e556f
+Subproject commit ddc6b2d5df4bdfd013a9ba948b30724e4ec41170
diff --git a/docs/references/io.rst b/docs/references/io.rst
index acf8a48f..4a3db96b 100644
--- a/docs/references/io.rst
+++ b/docs/references/io.rst
@@ -16,6 +16,8 @@ HTS IO
 
     load
     load_question_set
+    write_audacity_labels
+    write_textgrid
 
 .. autoclass:: HTSLabelFile
     :members:
diff --git a/nnmnkwii/baseline/gmm.py b/nnmnkwii/baseline/gmm.py
index 943a6c60..577ac675 100644
--- a/nnmnkwii/baseline/gmm.py
+++ b/nnmnkwii/baseline/gmm.py
@@ -1,8 +1,44 @@
 import numpy as np
+from nnmnkwii.paramgen import mlpg
+from scipy import linalg
 from sklearn.mixture import GaussianMixture
-from sklearn.mixture.gaussian_mixture import _compute_precision_cholesky
 
-from nnmnkwii.paramgen import mlpg
+
+# ref: https://github.com/scikit-learn/scikit-learn/blob/0.24.1/sklearn/mixture/_gaussian_mixture.py
+def _compute_precision_cholesky(covariances, covariance_type):
+    estimate_precision_error_message = (
+        "Fitting the mixture model failed because some components have "
+        "ill-defined empirical covariance (for instance caused by singleton "
+        "or collapsed samples). Try to decrease the number of components, "
+        "or increase reg_covar."
+    )
+
+    if covariance_type == "full":
+        n_components, n_features, _ = covariances.shape
+        precisions_chol = np.empty((n_components, n_features, n_features))
+        for k, covariance in enumerate(covariances):
+            try:
+                cov_chol = linalg.cholesky(covariance, lower=True)
+            except linalg.LinAlgError:
+                raise ValueError(estimate_precision_error_message)
+            precisions_chol[k] = linalg.solve_triangular(
+                cov_chol, np.eye(n_features), lower=True
+            ).T
+    elif covariance_type == "tied":
+        _, n_features = covariances.shape
+        try:
+            cov_chol = linalg.cholesky(covariances, lower=True)
+        except linalg.LinAlgError:
+            raise ValueError(estimate_precision_error_message)
+        precisions_chol = linalg.solve_triangular(
+            cov_chol, np.eye(n_features), lower=True
+        ).T
+    else:
+        if np.any(np.less_equal(covariances, 0.0)):
+            raise ValueError(estimate_precision_error_message)
+        precisions_chol = 1.0 / np.sqrt(covariances)
+    return precisions_chol
+
 
 # TODO: this can be refactored to be more flexible
 # e.g. take `swap` and `diff` out of the class
@@ -39,19 +75,21 @@ def __init__(self, gmm, swap=False, diff=False):
         # p(x), which is used to compute posterior prob. for a given source
         # spectral feature in mapping stage.
         self.px = GaussianMixture(
-            n_components=self.num_mixtures, covariance_type="full")
+            n_components=self.num_mixtures, covariance_type="full"
+        )
         self.px.means_ = self.src_means
         self.px.covariances_ = self.covarXX
         self.px.weights_ = self.weights
         self.px.precisions_cholesky_ = _compute_precision_cholesky(
-            self.px.covariances_, "full")
+            self.px.covariances_, "full"
+        )
 
     def transform(self, src):
         if src.ndim == 2:
             tgt = np.zeros_like(src)
             for idx, x in enumerate(src):
                 y = self._transform_frame(x)
-                tgt[idx][:len(y)] = y
+                tgt[idx][: len(y)] = y
             return tgt
         else:
             return self._transform_frame(src)
@@ -201,8 +239,9 @@ def transform(self, src):
         for t in range(T):
             m = optimum_mix[t]
             # Eq. (23), with approximating covariances as diagonals
-            D[t] = np.diag(self.covarYY[m]) - np.diag(self.covarYX[m]) / \
-                np.diag(self.covarXX[m]) * np.diag(self.covarXY[m])
+            D[t] = np.diag(self.covarYY[m]) - np.diag(self.covarYX[m]) / np.diag(
+                self.covarXX[m]
+            ) * np.diag(self.covarXY[m])
 
         # Once we have mean and variance over frames, then we can do MLPG
         return mlpg(E, D, self.windows)
diff --git a/nnmnkwii/datasets/__init__.py b/nnmnkwii/datasets/__init__.py
index 09d00f42..5b405dbd 100644
--- a/nnmnkwii/datasets/__init__.py
+++ b/nnmnkwii/datasets/__init__.py
@@ -173,7 +173,7 @@ def asarray(self, padded_length=None, dtype=np.float32,
         D = self[0].shape[-1]
         N = len(self)
         X = np.zeros((N, T, D), dtype=dtype)
-        lengths = np.zeros(N, dtype=np.int)
+        lengths = np.zeros(N, dtype=int)
 
         if verbose > 0:
             def custom_range(x):
diff --git a/nnmnkwii/frontend/merlin.py b/nnmnkwii/frontend/merlin.py
index a175b06f..4aa12f62 100644
--- a/nnmnkwii/frontend/merlin.py
+++ b/nnmnkwii/frontend/merlin.py
@@ -1,5 +1,3 @@
-# coding: utf-8
-
 # Part of code here is adapted from Merlin. Their license follows:
 ##########################################################################
 #           The Neural Network (NN) based Speech Synthesis System
@@ -40,12 +38,9 @@
 #  THIS SOFTWARE.
 ##########################################################################
 
-from __future__ import division, print_function, absolute_import
-
 import numpy as np
-
-from nnmnkwii.io import hts
 from nnmnkwii.frontend import NOTE_MAPPING
+from nnmnkwii.io import hts
 
 
 def get_frame_feature_size(subphone_features="full"):
@@ -54,34 +49,34 @@ def get_frame_feature_size(subphone_features="full"):
         return 0
     subphone_features = subphone_features.strip().lower()
     if subphone_features == "none":
-        raise ValueError(
-            "subphone_features = 'none' is deprecated, use None instead")
-    if subphone_features == 'full':
+        raise ValueError("subphone_features = 'none' is deprecated, use None instead")
+    if subphone_features == "full":
         return 9  # zhizheng's original 5 state features + 4 phoneme features
-    elif subphone_features == 'minimal_frame':
+    elif subphone_features == "minimal_frame":
         # the minimal features necessary to go from a state-level to
         # frame-level model
         return 2
-    elif subphone_features == 'state_only':
+    elif subphone_features == "state_only":
         return 1  # this is equivalent to a state-based system
-    elif subphone_features == 'frame_only':
+    elif subphone_features == "frame_only":
         # this is equivalent to a frame-based system without relying on
         # state-features
         return 1
-    elif subphone_features == 'uniform_state':
+    elif subphone_features == "uniform_state":
         # this is equivalent to a frame-based system with uniform
         # state-features
         return 2
-    elif subphone_features == 'minimal_phoneme':
+    elif subphone_features == "minimal_phoneme":
         # this is equivalent to a frame-based system with minimal features
         return 3
-    elif subphone_features == 'coarse_coding':
+    elif subphone_features == "coarse_coding":
         # this is equivalent to a frame-based positioning system reported in
         # Heiga Zen's work
         return 4
     else:
         raise ValueError(
-            'Unknown value for subphone_features: %s' % (subphone_features))
+            "Unknown value for subphone_features: %s" % (subphone_features)
+        )
     assert False
 
 
@@ -101,6 +96,7 @@ def compute_coarse_coding_features(num_states=3, npoints=600):
     sigma = 0.4
 
     from scipy.stats import norm
+
     cc_features[0, :] = norm(mu1, sigma).pdf(x1)
     cc_features[1, :] = norm(mu2, sigma).pdf(x2)
     cc_features[2, :] = norm(mu3, sigma).pdf(x3)
@@ -126,10 +122,13 @@ def extract_coarse_coding_features_relative(cc_features, phone_duration):
 
 def pattern_matching_binary(binary_dict, label):
     dict_size = len(binary_dict)
-    lab_binary_vector = np.zeros((1, dict_size), dtype=np.int)
+    lab_binary_vector = np.zeros((1, dict_size), dtype=int)
 
     for i in range(dict_size):
         current_question_list = binary_dict[i]
+        # NOTE: newer version returns tuple of (name, question)
+        if isinstance(current_question_list, tuple):
+            current_question_list = current_question_list[1]
         binary_flag = 0
         for iq in range(len(current_question_list)):
             current_compiled = current_question_list[iq]
@@ -143,15 +142,20 @@ def pattern_matching_binary(binary_dict, label):
     return lab_binary_vector
 
 
-def pattern_matching_continous_position(continuous_dict, label):
-    dict_size = len(continuous_dict)
+def pattern_matching_continous_position(numeric_dict, label):
+    dict_size = len(numeric_dict)
 
     lab_continuous_vector = np.zeros((1, dict_size), dtype=np.float32)
     for i in range(dict_size):
+        current_compiled = numeric_dict[i]
 
-        continuous_value = -1.0
-
-        current_compiled = continuous_dict[i]
+        # NOTE: newer version returns tuple of (name, question)
+        if isinstance(current_compiled, tuple):
+            current_compiled = current_compiled[1]
+        if "([-\d]+)" in current_compiled.pattern:
+            continuous_value = -50.0
+        else:
+            continuous_value = -1.0
 
         ms = current_compiled.search(label)
         if ms is not None:
@@ -169,13 +173,15 @@ def pattern_matching_continous_position(continuous_dict, label):
     return lab_continuous_vector
 
 
-def load_labels_with_phone_alignment(hts_labels,
-                                     binary_dict,
-                                     continuous_dict,
-                                     subphone_features=None,
-                                     add_frame_features=False,
-                                     frame_shift=50000):
-    dict_size = len(binary_dict) + len(continuous_dict)
+def load_labels_with_phone_alignment(
+    hts_labels,
+    binary_dict,
+    numeric_dict,
+    subphone_features=None,
+    add_frame_features=False,
+    frame_shift=50000,
+):
+    dict_size = len(binary_dict) + len(numeric_dict)
     frame_feature_size = get_frame_feature_size(subphone_features)
     dimension = frame_feature_size + dict_size
 
@@ -193,86 +199,96 @@ def load_labels_with_phone_alignment(hts_labels,
     for idx, (start_time, end_time, full_label) in enumerate(hts_labels):
         frame_number = int(end_time / frame_shift) - int(start_time / frame_shift)
 
-        label_binary_vector = pattern_matching_binary(
-            binary_dict, full_label)
+        label_binary_vector = pattern_matching_binary(binary_dict, full_label)
 
         # if there is no CQS question, the label_continuous_vector will
         # become to empty
         label_continuous_vector = pattern_matching_continous_position(
-            continuous_dict, full_label)
+            numeric_dict, full_label
+        )
         label_vector = np.concatenate(
-            [label_binary_vector, label_continuous_vector], axis=1)
+            [label_binary_vector, label_continuous_vector], axis=1
+        )
 
         if subphone_features == "coarse_coding":
-            cc_feat_matrix = extract_coarse_coding_features_relative(cc_features,
-                                                                     frame_number)
+            cc_feat_matrix = extract_coarse_coding_features_relative(
+                cc_features, frame_number
+            )
 
         if add_frame_features:
             current_block_binary_array = np.zeros(
-                (frame_number, dict_size + frame_feature_size))
+                (frame_number, dict_size + frame_feature_size)
+            )
             for i in range(frame_number):
-                current_block_binary_array[i,
-                                           0:dict_size] = label_vector
+                current_block_binary_array[i, 0:dict_size] = label_vector
 
-                if subphone_features == 'minimal_phoneme':
+                if subphone_features == "minimal_phoneme":
                     # features which distinguish frame position in phoneme
                     # fraction through phone forwards
-                    current_block_binary_array[i, dict_size] = float(
-                        i + 1) / float(frame_number)
+                    current_block_binary_array[i, dict_size] = float(i + 1) / float(
+                        frame_number
+                    )
                     # fraction through phone backwards
                     current_block_binary_array[i, dict_size + 1] = float(
-                        frame_number - i) / float(frame_number)
+                        frame_number - i
+                    ) / float(frame_number)
                     # phone duration
-                    current_block_binary_array[i,
-                                               dict_size + 2] = float(frame_number)
+                    current_block_binary_array[i, dict_size + 2] = float(frame_number)
 
-                elif subphone_features == 'coarse_coding':
+                elif subphone_features == "coarse_coding":
                     # features which distinguish frame position in phoneme
                     # using three continous numerical features
-                    current_block_binary_array[i,
-                                               dict_size + 0] = cc_feat_matrix[i, 0]
-                    current_block_binary_array[i,
-                                               dict_size + 1] = cc_feat_matrix[i, 1]
-                    current_block_binary_array[i,
-                                               dict_size + 2] = cc_feat_matrix[i, 2]
-                    current_block_binary_array[i,
-                                               dict_size + 3] = float(frame_number)
+                    current_block_binary_array[i, dict_size + 0] = cc_feat_matrix[i, 0]
+                    current_block_binary_array[i, dict_size + 1] = cc_feat_matrix[i, 1]
+                    current_block_binary_array[i, dict_size + 2] = cc_feat_matrix[i, 2]
+                    current_block_binary_array[i, dict_size + 3] = float(frame_number)
 
                 elif subphone_features is None:
                     pass
                 else:
                     raise ValueError(
                         "Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
-                            subphone_features, add_frame_features))
-            label_feature_matrix[label_feature_index:label_feature_index +
-                                 frame_number, ] = current_block_binary_array
+                            subphone_features, add_frame_features
+                        )
+                    )
+            label_feature_matrix[
+                label_feature_index : label_feature_index + frame_number,
+            ] = current_block_binary_array
             label_feature_index = label_feature_index + frame_number
 
         elif subphone_features is None:
             current_block_binary_array = label_vector
-            label_feature_matrix[label_feature_index:label_feature_index +
-                                 1, ] = current_block_binary_array
+            label_feature_matrix[
+                label_feature_index : label_feature_index + 1,
+            ] = current_block_binary_array
             label_feature_index = label_feature_index + 1
         else:
             pass
 
     # omg
     if label_feature_index == 0:
-        raise ValueError("Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
-            subphone_features, add_frame_features))
+        raise ValueError(
+            "Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
+                subphone_features, add_frame_features
+            )
+        )
 
-    label_feature_matrix = label_feature_matrix[0:label_feature_index, ]
+    label_feature_matrix = label_feature_matrix[
+        0:label_feature_index,
+    ]
 
     return label_feature_matrix
 
 
-def load_labels_with_state_alignment(hts_labels,
-                                     binary_dict,
-                                     continuous_dict,
-                                     subphone_features=None,
-                                     add_frame_features=False,
-                                     frame_shift=50000):
-    dict_size = len(binary_dict) + len(continuous_dict)
+def load_labels_with_state_alignment(
+    hts_labels,
+    binary_dict,
+    numeric_dict,
+    subphone_features=None,
+    add_frame_features=False,
+    frame_shift=50000,
+):
+    dict_size = len(binary_dict) + len(numeric_dict)
     frame_feature_size = get_frame_feature_size(subphone_features)
     dimension = frame_feature_size + dict_size
 
@@ -290,8 +306,7 @@ def load_labels_with_state_alignment(hts_labels,
 
     phone_duration = 0
     state_duration_base = 0
-    for current_index, (start_time, end_time,
-                        full_label) in enumerate(hts_labels):
+    for current_index, (start_time, end_time, full_label) in enumerate(hts_labels):
         # remove state information [k]
         assert full_label[-1] == "]"
         full_label_length = len(full_label) - 3
@@ -308,15 +323,16 @@ def load_labels_with_state_alignment(hts_labels,
             phone_duration = frame_number
             state_duration_base = 0
 
-            label_binary_vector = pattern_matching_binary(
-                binary_dict, full_label)
+            label_binary_vector = pattern_matching_binary(binary_dict, full_label)
 
             # if there is no CQS question, the label_continuous_vector will
             # become to empty
             label_continuous_vector = pattern_matching_continous_position(
-                continuous_dict, full_label)
+                numeric_dict, full_label
+            )
             label_vector = np.concatenate(
-                [label_binary_vector, label_continuous_vector], axis=1)
+                [label_binary_vector, label_continuous_vector], axis=1
+            )
 
             for i in range(state_number - 1):
                 s, e, _ = hts_labels[current_index + i + 1]
@@ -324,115 +340,131 @@ def load_labels_with_state_alignment(hts_labels,
 
             if subphone_features == "coarse_coding":
                 cc_feat_matrix = extract_coarse_coding_features_relative(
-                    cc_features, phone_duration)
+                    cc_features, phone_duration
+                )
 
         if add_frame_features:
             current_block_binary_array = np.zeros(
-                (frame_number, dict_size + frame_feature_size))
+                (frame_number, dict_size + frame_feature_size)
+            )
             for i in range(frame_number):
-                current_block_binary_array[i,
-                                           0: dict_size] = label_vector
+                current_block_binary_array[i, 0:dict_size] = label_vector
 
-                if subphone_features == 'full':
+                if subphone_features == "full":
                     # Zhizheng's original 9 subphone features:
                     # fraction through state (forwards)
-                    current_block_binary_array[i, dict_size] = float(
-                        i + 1) / float(frame_number)
+                    current_block_binary_array[i, dict_size] = float(i + 1) / float(
+                        frame_number
+                    )
                     # fraction through state (backwards)
                     current_block_binary_array[i, dict_size + 1] = float(
-                        frame_number - i) / float(frame_number)
+                        frame_number - i
+                    ) / float(frame_number)
                     # length of state in frames
-                    current_block_binary_array[i,
-                                               dict_size + 2] = float(frame_number)
+                    current_block_binary_array[i, dict_size + 2] = float(frame_number)
                     # state index (counting forwards)
-                    current_block_binary_array[i,
-                                               dict_size + 3] = float(state_index)
+                    current_block_binary_array[i, dict_size + 3] = float(state_index)
                     # state index (counting backwards)
-                    current_block_binary_array[i, dict_size +
-                                               4] = float(state_index_backward)
+                    current_block_binary_array[i, dict_size + 4] = float(
+                        state_index_backward
+                    )
 
                     # length of phone in frames
-                    current_block_binary_array[i,
-                                               dict_size + 5] = float(phone_duration)
+                    current_block_binary_array[i, dict_size + 5] = float(phone_duration)
                     # fraction of the phone made up by current state
-                    current_block_binary_array[i, dict_size +
-                                               6] = float(frame_number) / float(phone_duration)
+                    current_block_binary_array[i, dict_size + 6] = float(
+                        frame_number
+                    ) / float(phone_duration)
                     # fraction through phone (backwards)
                     current_block_binary_array[i, dict_size + 7] = float(
-                        phone_duration - i - state_duration_base) / float(phone_duration)
+                        phone_duration - i - state_duration_base
+                    ) / float(phone_duration)
                     # fraction through phone (forwards)
                     current_block_binary_array[i, dict_size + 8] = float(
-                        state_duration_base + i + 1) / float(phone_duration)
+                        state_duration_base + i + 1
+                    ) / float(phone_duration)
 
-                elif subphone_features == 'state_only':
+                elif subphone_features == "state_only":
                     # features which only distinguish state:
                     current_block_binary_array[i, dict_size] = float(
-                        state_index)  # state index (counting forwards)
+                        state_index
+                    )  # state index (counting forwards)
 
-                elif subphone_features == 'frame_only':
+                elif subphone_features == "frame_only":
                     # features which distinguish frame position in phoneme:
                     current_frame_number += 1
                     # fraction through phone (counting forwards)
                     current_block_binary_array[i, dict_size] = float(
-                        current_frame_number) / float(phone_duration)
+                        current_frame_number
+                    ) / float(phone_duration)
 
-                elif subphone_features == 'uniform_state':
+                elif subphone_features == "uniform_state":
                     # features which distinguish frame position in phoneme:
                     current_frame_number += 1
                     # fraction through phone (counting forwards)
                     current_block_binary_array[i, dict_size] = float(
-                        current_frame_number) / float(phone_duration)
+                        current_frame_number
+                    ) / float(phone_duration)
                     new_state_index = max(
-                        1, round(float(current_frame_number) / float(phone_duration) * 5))
+                        1,
+                        round(float(current_frame_number) / float(phone_duration) * 5),
+                    )
                     # state index (counting forwards)
-                    current_block_binary_array[i,
-                                               dict_size + 1] = float(new_state_index)
+                    current_block_binary_array[i, dict_size + 1] = float(
+                        new_state_index
+                    )
 
                 elif subphone_features == "coarse_coding":
                     # features which distinguish frame position in phoneme
                     # using three continous numerical features
-                    current_block_binary_array[i, dict_size +
-                                               0] = cc_feat_matrix[current_frame_number, 0]
-                    current_block_binary_array[i, dict_size +
-                                               1] = cc_feat_matrix[current_frame_number, 1]
-                    current_block_binary_array[i, dict_size +
-                                               2] = cc_feat_matrix[current_frame_number, 2]
-                    current_block_binary_array[i,
-                                               dict_size + 3] = float(phone_duration)
+                    current_block_binary_array[i, dict_size + 0] = cc_feat_matrix[
+                        current_frame_number, 0
+                    ]
+                    current_block_binary_array[i, dict_size + 1] = cc_feat_matrix[
+                        current_frame_number, 1
+                    ]
+                    current_block_binary_array[i, dict_size + 2] = cc_feat_matrix[
+                        current_frame_number, 2
+                    ]
+                    current_block_binary_array[i, dict_size + 3] = float(phone_duration)
                     current_frame_number += 1
 
-                elif subphone_features == 'minimal_frame':
+                elif subphone_features == "minimal_frame":
                     # features which distinguish state and minimally frame
                     # position in state:
-                    current_block_binary_array[i, dict_size] = float(
-                        i + 1) / float(frame_number)  # fraction through state (forwards)
+                    current_block_binary_array[i, dict_size] = float(i + 1) / float(
+                        frame_number
+                    )  # fraction through state (forwards)
                     # state index (counting forwards)
-                    current_block_binary_array[i,
-                                               dict_size + 1] = float(state_index)
+                    current_block_binary_array[i, dict_size + 1] = float(state_index)
                 elif subphone_features is None:
                     pass
                 else:
                     assert False
 
-            label_feature_matrix[label_feature_index:label_feature_index +
-                                 frame_number] = current_block_binary_array
+            label_feature_matrix[
+                label_feature_index : label_feature_index + frame_number
+            ] = current_block_binary_array
             label_feature_index = label_feature_index + frame_number
-        elif subphone_features == 'state_only' and state_index == state_number:
+        elif subphone_features == "state_only" and state_index == state_number:
             # TODO: this pass seems not working
             current_block_binary_array = np.zeros(
-                (state_number, dict_size + frame_feature_size))
+                (state_number, dict_size + frame_feature_size)
+            )
             for i in range(state_number):
-                current_block_binary_array[i,
-                                           0:dict_size] = label_vector
+                current_block_binary_array[i, 0:dict_size] = label_vector
                 current_block_binary_array[i, dict_size] = float(
-                    i + 1)  # state index (counting forwards)
-            label_feature_matrix[label_feature_index:label_feature_index +
-                                 state_number, ] = current_block_binary_array
+                    i + 1
+                )  # state index (counting forwards)
+            label_feature_matrix[
+                label_feature_index : label_feature_index + state_number,
+            ] = current_block_binary_array
             label_feature_index = label_feature_index + state_number
         elif subphone_features is None and state_index == state_number:
             current_block_binary_array = label_vector
-            label_feature_matrix[label_feature_index:label_feature_index +
-                                 1, ] = current_block_binary_array
+            label_feature_matrix[
+                label_feature_index : label_feature_index + 1,
+            ] = current_block_binary_array
             label_feature_index = label_feature_index + 1
         else:
             pass
@@ -441,10 +473,15 @@ def load_labels_with_state_alignment(hts_labels,
 
     # omg
     if label_feature_index == 0:
-        raise ValueError("Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
-            subphone_features, add_frame_features))
-
-    label_feature_matrix = label_feature_matrix[0:label_feature_index, ]
+        raise ValueError(
+            "Combination of subphone_features and add_frame_features is not supported: {}, {}".format(
+                subphone_features, add_frame_features
+            )
+        )
+
+    label_feature_matrix = label_feature_matrix[
+        0:label_feature_index,
+    ]
     return label_feature_matrix
 
 
@@ -464,7 +501,7 @@ def linguistic_features(hts_labels, *args, **kwargs):
     Args:
         hts_label (hts.HTSLabelFile): Input full-context label file
         binary_dict (dict): Dictionary used to extract binary features
-        continuous_dict (dict): Dictionary used to extrract continuous features
+        numeric_dict (dict): Dictionary used to extrract continuous features
         subphone_features (dict): Type of sub-phone features. According
           to the Merlin's source code, None, ``full``, ``state_only``,
           ``frame_only``, ``uniform_state``, ``minimal_phoneme`` and
@@ -485,12 +522,12 @@ def linguistic_features(hts_labels, *args, **kwargs):
         >>> from nnmnkwii.io import hts
         >>> from nnmnkwii.util import example_label_file, example_question_file
         >>> labels = hts.load(example_label_file(phone_level=False))
-        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
-        >>> features = fe.linguistic_features(labels, binary_dict, continuous_dict,
+        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
+        >>> features = fe.linguistic_features(labels, binary_dict, numeric_dict,
         ...     subphone_features="full", add_frame_features=True)
         >>> features.shape
         (615, 425)
-        >>> features = fe.linguistic_features(labels, binary_dict, continuous_dict,
+        >>> features = fe.linguistic_features(labels, binary_dict, numeric_dict,
         ...     subphone_features=None, add_frame_features=False)
         >>> features.shape
         (40, 416)
@@ -501,12 +538,12 @@ def linguistic_features(hts_labels, *args, **kwargs):
         >>> from nnmnkwii.io import hts
         >>> from nnmnkwii.util import example_label_file, example_question_file
         >>> labels = hts.load(example_label_file(phone_level=True))
-        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
-        >>> features = fe.linguistic_features(labels, binary_dict, continuous_dict,
+        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
+        >>> features = fe.linguistic_features(labels, binary_dict, numeric_dict,
         ...     subphone_features="coarse_coding", add_frame_features=True)
         >>> features.shape
         (615, 420)
-        >>> features = fe.linguistic_features(labels, binary_dict, continuous_dict,
+        >>> features = fe.linguistic_features(labels, binary_dict, numeric_dict,
         ...     subphone_features=None, add_frame_features=False)
         >>> features.shape
         (40, 416)
@@ -518,11 +555,13 @@ def linguistic_features(hts_labels, *args, **kwargs):
         return load_labels_with_phone_alignment(hts_labels, *args, **kwargs)
 
 
-def extract_dur_from_state_alignment_labels(hts_labels,
-                                            feature_type="numerical",
-                                            unit_size="state",
-                                            feature_size="phoneme",
-                                            frame_shift=50000):
+def extract_dur_from_state_alignment_labels(
+    hts_labels,
+    feature_type="numerical",
+    unit_size="state",
+    feature_size="phoneme",
+    frame_shift=50000,
+):
     if feature_type not in ["binary", "numerical"]:
         raise ValueError("Not supported")
     if unit_size not in ["phoneme", "state"]:
@@ -532,26 +571,22 @@ def extract_dur_from_state_alignment_labels(hts_labels,
 
     dur_dim = hts_labels.num_states() if unit_size == "state" else 1
     if feature_size == "phoneme":
-        dur_feature_matrix = np.empty(
-            (hts_labels.num_phones(), dur_dim), dtype=np.int)
+        dur_feature_matrix = np.empty((hts_labels.num_phones(), dur_dim), dtype=int)
     else:
-        dur_feature_matrix = np.empty(
-            (hts_labels.num_frames(), dur_dim), dtype=np.int)
+        dur_feature_matrix = np.empty((hts_labels.num_frames(), dur_dim), dtype=int)
 
     current_dur_array = np.zeros((dur_dim, 1))
     state_number = hts_labels.num_states()
     dur_dim = state_number
 
     dur_feature_index = 0
-    for current_index, (start_time, end_time,
-                        full_label) in enumerate(hts_labels):
+    for current_index, (start_time, end_time, full_label) in enumerate(hts_labels):
         # remove state information [k]
         full_label_length = len(full_label) - 3
         state_index = full_label[full_label_length + 1]
         state_index = int(state_index) - 1
 
-        frame_number = (
-            end_time - start_time) // frame_shift
+        frame_number = (end_time - start_time) // frame_shift
 
         if state_index == 1:
             phone_duration = frame_number
@@ -576,7 +611,8 @@ def extract_dur_from_state_alignment_labels(hts_labels,
                     current_block_array = current_dur_array.transpose()
                 if feature_size == "frame":
                     current_block_array = np.tile(
-                        current_dur_array.transpose(), (frame_number, 1))
+                        current_dur_array.transpose(), (frame_number, 1)
+                    )
             elif unit_size == "phoneme":
                 current_block_array = np.array([phone_duration])
             else:
@@ -584,25 +620,31 @@ def extract_dur_from_state_alignment_labels(hts_labels,
 
         # writing into dur_feature_matrix
         if feature_size == "frame":
-            dur_feature_matrix[dur_feature_index:dur_feature_index +
-                               frame_number, ] = current_block_array
+            dur_feature_matrix[
+                dur_feature_index : dur_feature_index + frame_number,
+            ] = current_block_array
             dur_feature_index = dur_feature_index + frame_number
         elif feature_size == "phoneme" and state_index == state_number:
-            dur_feature_matrix[dur_feature_index:dur_feature_index +
-                               1, ] = current_block_array
+            dur_feature_matrix[
+                dur_feature_index : dur_feature_index + 1,
+            ] = current_block_array
             dur_feature_index = dur_feature_index + 1
         else:
             pass
 
-    dur_feature_matrix = dur_feature_matrix[0:dur_feature_index, ]
+    dur_feature_matrix = dur_feature_matrix[
+        0:dur_feature_index,
+    ]
     return dur_feature_matrix
 
 
-def extract_dur_from_phone_alignment_labels(hts_labels,
-                                            feature_type="numerical",
-                                            unit_size="phoneme",
-                                            feature_size="phoneme",
-                                            frame_shift=50000):
+def extract_dur_from_phone_alignment_labels(
+    hts_labels,
+    feature_type="numerical",
+    unit_size="phoneme",
+    feature_size="phoneme",
+    frame_shift=50000,
+):
     if feature_type not in ["binary", "numerical"]:
         raise ValueError("Not supported")
     if unit_size != "phoneme":
@@ -610,11 +652,9 @@ def extract_dur_from_phone_alignment_labels(hts_labels,
     if feature_size not in ["phoneme", "frame"]:
         raise ValueError("Not supported")
     if feature_size == "phoneme":
-        dur_feature_matrix = np.empty(
-            (hts_labels.num_phones(), 1), dtype=np.int)
+        dur_feature_matrix = np.empty((hts_labels.num_phones(), 1), dtype=int)
     else:
-        dur_feature_matrix = np.empty(
-            (hts_labels.num_frames(), 1), dtype=np.int)
+        dur_feature_matrix = np.empty((hts_labels.num_frames(), 1), dtype=int)
     dur_feature_index = 0
     for current_index, (start_time, end_time, _) in enumerate(hts_labels):
         frame_number = (end_time - start_time) / frame_shift
@@ -631,12 +671,14 @@ def extract_dur_from_phone_alignment_labels(hts_labels,
 
         # writing into dur_feature_matrix
         if feature_size == "frame":
-            dur_feature_matrix[dur_feature_index:dur_feature_index +
-                               frame_number] = current_block_array
+            dur_feature_matrix[
+                dur_feature_index : dur_feature_index + frame_number
+            ] = current_block_array
             dur_feature_index = dur_feature_index + frame_number
         elif feature_size == "phoneme":
-            dur_feature_matrix[dur_feature_index:dur_feature_index +
-                               1] = current_block_array
+            dur_feature_matrix[
+                dur_feature_index : dur_feature_index + 1
+            ] = current_block_array
             dur_feature_index = dur_feature_index + 1
         else:
             assert False
@@ -690,8 +732,6 @@ def duration_features(hts_labels, *args, **kwargs):
 
     """
     if hts_labels.is_state_alignment_label():
-        return extract_dur_from_state_alignment_labels(
-            hts_labels, *args, **kwargs)
+        return extract_dur_from_state_alignment_labels(hts_labels, *args, **kwargs)
     else:
-        return extract_dur_from_phone_alignment_labels(
-            hts_labels, *args, **kwargs)
+        return extract_dur_from_phone_alignment_labels(hts_labels, *args, **kwargs)
diff --git a/nnmnkwii/io/__init__.py b/nnmnkwii/io/__init__.py
index d675d93d..e69de29b 100644
--- a/nnmnkwii/io/__init__.py
+++ b/nnmnkwii/io/__init__.py
@@ -1 +0,0 @@
-from __future__ import division, print_function, absolute_import
diff --git a/nnmnkwii/io/hts.py b/nnmnkwii/io/hts.py
index bb123790..20ffab42 100644
--- a/nnmnkwii/io/hts.py
+++ b/nnmnkwii/io/hts.py
@@ -38,12 +38,11 @@
 #  THIS SOFTWARE.
 ##########################################################################
 
-from __future__ import division, print_function, absolute_import
-
-import numpy as np
 import re
 from copy import copy
 
+import numpy as np
+
 
 class HTSLabelFile(object):
     """Memory representation for HTS-style context labels (a.k.a HTK alignment).
@@ -132,8 +131,12 @@ def __repr__(self):
 
     def round_(self):
         s = self.frame_shift
-        self.start_times = list(np.round(np.asarray(self.start_times) / s).astype(np.int64) * s)
-        self.end_times = list(np.round(np.asarray(self.end_times) / s).astype(np.int64) * s)
+        self.start_times = list(
+            np.round(np.asarray(self.start_times) / s).astype(np.int64) * s
+        )
+        self.end_times = list(
+            np.round(np.asarray(self.end_times) / s).astype(np.int64) * s
+        )
         return self
 
     def append(self, label, strict=True):
@@ -158,11 +161,15 @@ def append(self, label, strict=True):
             if start_time >= end_time:
                 raise ValueError(
                     "end_time ({}) must be larger than start_time ({}).".format(
-                        end_time, start_time))
+                        end_time, start_time
+                    )
+                )
             if len(self.end_times) > 0 and start_time != self.end_times[-1]:
                 raise ValueError(
                     "start_time ({}) must be equal to the last end_time ({}).".format(
-                        start_time, self.end_times[-1]))
+                        start_time, self.end_times[-1]
+                    )
+                )
 
         self.start_times.append(start_time)
         self.end_times.append(end_time)
@@ -178,8 +185,9 @@ def set_durations(self, durations, frame_shift=50000):
         offset = self.start_times[0]
 
         # Unwrap state-axis
-        end_times = offset + np.cumsum(
-            durations.reshape(-1, 1) * frame_shift).astype(np.int64)
+        end_times = offset + np.cumsum(durations.reshape(-1, 1) * frame_shift).astype(
+            np.int64
+        )
         if len(end_times) != len(self.end_times):
             raise RuntimeError("Unexpected input, maybe")
         start_times = np.hstack((offset, end_times[:-1])).astype(np.int64)
@@ -272,15 +280,15 @@ def silence_frame_indices(self, regex=None, frame_shift=50000):
         end_times = np.array(self.end_times)
         s = start_times[indices] // frame_shift
         e = end_times[indices] // frame_shift
-        return np.unique(np.concatenate(
-            [np.arange(a, b) for (a, b) in zip(s, e)], axis=0)).astype(np.int64)
+        return np.unique(
+            np.concatenate([np.arange(a, b) for (a, b) in zip(s, e)], axis=0)
+        ).astype(np.int64)
 
     def is_state_alignment_label(self):
-        return self.contexts[0][-1] == ']' and self.contexts[0][-3] == '['
+        return self.contexts[0][-1] == "]" and self.contexts[0][-3] == "["
 
     def num_states(self):
-        """Returnes number of states exclusing special begin/end states.
-        """
+        """Returnes number of states exclusing special begin/end states."""
         if not self.is_state_alignment_label():
             return 1
 
@@ -332,32 +340,34 @@ def wildcards2regex(question, convert_number_pattern=False, convert_svs_pattern=
     extracting continuous values):
     (\d+)       -- handles digit without decimal point
     ([\d\.]+)   -- handles digits with and without decimal point
+    ([-\d]+)    -- handles positive and negative numbers
     """
 
     # handle HTK wildcards (and lack of them) at ends of label:
     prefix = ""
     postfix = ""
-    if '*' in question:
-        if not question.startswith('*'):
+    if "*" in question:
+        if not question.startswith("*"):
             prefix = "\A"
-        if not question.endswith('*'):
+        if not question.endswith("*"):
             postfix = "\Z"
-    question = question.strip('*')
+    question = question.strip("*")
     question = re.escape(question)
     # convert remaining HTK wildcards * and ? to equivalent regex:
-    question = question.replace('\\*', '.*')
+    question = question.replace("\\*", ".*")
     question = prefix + question + postfix
 
     if convert_number_pattern:
-        question = question.replace('\\(\\\\d\\+\\)', '(\d+)')
-        question = question.replace(
-            '\\(\\[\\\\d\\\\\\.\\]\\+\\)', '([\d\.]+)')
+        question = question.replace("\\(\\\\d\\+\\)", "(\d+)")
+        question = question.replace("\\(\\[\\-\\\\d\\]\\+\\)", "([-\d]+)")
+        question = question.replace("\\(\\[\\\\d\\\\\\.\\]\\+\\)", "([\d\.]+)")
     # NOTE: singing voice synthesis specific handling
     if convert_svs_pattern:
         question = question.replace(
-            '\\(\\[A\\-Z\\]\\[b\\]\\?\\[0\\-9\\]\\+\\)', '([A-Z][b]?[0-9]+)')
-        question = question.replace('\\(\\\\NOTE\\)', '([A-Z][b]?[0-9]+)')
-        question = question.replace('\\(\\[pm\\]\\\\d\\+\\)', '([pm]\d+)')
+            "\\(\\[A\\-Z\\]\\[b\\]\\?\\[0\\-9\\]\\+\\)", "([A-Z][b]?[0-9]+)"
+        )
+        question = question.replace("\\(\\\\NOTE\\)", "([A-Z][b]?[0-9]+)")
+        question = question.replace("\\(\\[pm\\]\\\\d\\+\\)", "([pm]\d+)")
 
     return question
 
@@ -377,54 +387,124 @@ def load_question_set(qs_file_name, append_hat_for_LL=True, convert_svs_pattern=
         convert_svs_pattern (bool): Convert SVS specific patterns.
 
     Returns:
-        (binary_dict, continuous_dict): Binary/continuous feature extraction
+        (binary_dict, numeric_dict): Binary/numeric feature extraction
         regexes.
 
     Examples:
         >>> from nnmnkwii.io import hts
         >>> from nnmnkwii.util import example_question_file
-        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
+        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
     """
     with open(qs_file_name) as f:
         lines = f.readlines()
     binary_qs_index = 0
     continuous_qs_index = 0
     binary_dict = {}
-    continuous_dict = {}
+    numeric_dict = {}
 
-    LL = re.compile(re.escape('LL-'))
+    LL = re.compile(re.escape("LL-"))
 
     for line in lines:
-        line = line.replace('\n', '')
+        line = line.replace("\n", "")
         temp_list = line.split()
         if len(line) <= 0 or line.startswith("#"):
             continue
-        temp_list = line.split('{')
+        name = temp_list[1].replace('"', "").replace("'", "")
+        temp_list = line.split("{")
         temp_line = temp_list[1]
-        temp_list = temp_line.split('}')
+        temp_list = temp_line.split("}")
         temp_line = temp_list[0]
         temp_line = temp_line.strip()
-        question_list = temp_line.split(',')
+        question_list = temp_line.split(",")
 
-        temp_list = line.split(' ')
+        temp_list = line.split(" ")
         question_key = temp_list[1]
-        if temp_list[0] == 'CQS':
+        if temp_list[0] == "CQS":
             assert len(question_list) == 1
             processed_question = wildcards2regex(
-                question_list[0], convert_number_pattern=True)
-            continuous_dict[continuous_qs_index] = re.compile(
-                processed_question)  # save pre-compiled regular expression
+                question_list[0],
+                convert_number_pattern=True,
+                convert_svs_pattern=convert_svs_pattern,
+            )
+            numeric_dict[continuous_qs_index] = (
+                name,
+                re.compile(processed_question),
+            )  # save pre-compiled regular expression
             continuous_qs_index = continuous_qs_index + 1
-        elif temp_list[0] == 'QS':
+        elif temp_list[0] == "QS":
             re_list = []
             for temp_question in question_list:
                 processed_question = wildcards2regex(temp_question)
-                if append_hat_for_LL and LL.search(question_key) and processed_question[0] != '^':
-                    processed_question = '^' + processed_question
+                if (
+                    append_hat_for_LL
+                    and LL.search(question_key)
+                    and processed_question[0] != "^"
+                ):
+                    processed_question = "^" + processed_question
                 re_list.append(re.compile(processed_question))
 
-            binary_dict[binary_qs_index] = re_list
+            binary_dict[binary_qs_index] = (name, re_list)
             binary_qs_index = binary_qs_index + 1
         else:
             raise RuntimeError("Not supported question format")
-    return binary_dict, continuous_dict
+    return binary_dict, numeric_dict
+
+
+def write_audacity_labels(dst_path, labels):
+    """Write audacity labels from HTS-style labels
+
+    Args:
+        dst_path (str): The output file path.
+        labels (HTSLabelFile): HTS style labels
+    """
+    with open(dst_path, "w") as of:
+        for s, e, l in labels:
+            s, e = s * 1e-7, e * 1e-7
+            if "-" in l and "+" in l:
+                ph = l.split("-")[1].split("+")[0]
+            else:
+                ph = l
+            of.write("{:.4f}\t{:.4f}\t{}\n".format(s, e, ph))
+
+
+def write_textgrid(dst_path, labels):
+    """Write TextGrid from HTS-style labels
+
+    Args:
+        dst_path (str): The output file path.
+        labels (HTSLabelFile): HTS style labels
+    """
+    template = """File type = "ooTextFile"
+Object class = "TextGrid"
+
+xmin = 0
+xmax = {xmax}
+tiers? <exists>
+size = 1
+item []:
+    item [1]:
+        class = "IntervalTier"
+        name = "phoneme"
+        xmin = 0
+        xmax = {xmax}
+        intervals: size = {size}"""
+    template = template.format(xmax=labels.end_times[-1] * 1e-7, size=len(labels))
+
+    for idx, (s, e, l) in enumerate(labels):
+        s, e = s * 1e-7, e * 1e-7
+        if "-" in l and "+" in l:
+            ph = l.split("-")[1].split("+")[0]
+        else:
+            ph = l
+
+        template += """
+        intervals [{idx}]:
+            xmin = {s}
+            xmax = {e}
+            text = "{ph}" """.format(
+            idx=idx + 1, s=s, e=e, ph=ph
+        )
+    template += "\n"
+
+    with open(dst_path, "w") as of:
+        of.write(template)
diff --git a/nnmnkwii/paramgen/__init__.py b/nnmnkwii/paramgen/__init__.py
index 97d276ad..8e388f66 100644
--- a/nnmnkwii/paramgen/__init__.py
+++ b/nnmnkwii/paramgen/__init__.py
@@ -1,16 +1,5 @@
 from __future__ import with_statement, print_function, absolute_import
 
-
-try:
-    import bandmat
-except ImportError:
-    raise ImportError("""
-    `bandmat` is required for the `paramgen` module but not found.
-    Please install it manually by:
-    `pip install bandmat`.
-    If you see installation errors, please report to https://github.com/MattShannon/bandmat.
-    """)
-
 from ._mlpg import build_win_mats, mlpg, mlpg_grad, unit_variance_mlpg_matrix
 from ._mlpg import reshape_means, full_window_mat
 
diff --git a/nnmnkwii/paramgen/_bandmat/License b/nnmnkwii/paramgen/_bandmat/License
new file mode 100644
index 00000000..6285aa94
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/License
@@ -0,0 +1,23 @@
+Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+   1. Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+   2. Redistributions in binary form must reproduce the above copyright notice,
+      this list of conditions and the following disclaimer in the documentation
+      and/or other materials provided with the distribution.
+   3. The name of the author may not be used to endorse or promote products
+      derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR "AS IS" AND ANY EXPRESS OR IMPLIED
+WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
+EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
+OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
+IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
+OF SUCH DAMAGE.
diff --git a/nnmnkwii/paramgen/_bandmat/__init__.py b/nnmnkwii/paramgen/_bandmat/__init__.py
new file mode 100644
index 00000000..bdae5264
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/__init__.py
@@ -0,0 +1,100 @@
+"""A banded matrix library for python.
+
+This package provides a simple banded matrix library for python.
+It supports banded matrix-vector and matrix-matrix multiplication, converting
+between full and banded matrix representations, and certain linear algebra
+operations on banded matrices.
+
+Overview
+--------
+
+A banded matrix is a matrix where only the diagonal, a number of superdiagonals
+and a number of subdiagonals are non-zero.
+The well-known BLAS interface and LAPACK library for linear algebra define
+several banded matrix operations, and some of these, such as banded Cholesky
+decomposition, are wrapped in the excellent python package scipy, specifically
+in scipy.linalg.
+The bandmat package re-uses the banded matrix representation used by BLAS,
+LAPACK and scipy.linalg, wrapping it in a lightweight class for ease of use.
+See the docstring for the BandMat class for full details of the representation
+used.
+
+The bandmat package provides:
+
+- a lightweight class wrapping the LAPACK-style banded matrix representation.
+  This class keeps track of things like bandwidths to allow a more direct
+  coding style when working with banded matrices.
+- some basic banded matrix operations not present in scipy.
+  For example, banded matrix-vector multiplication is defined by BLAS but not
+  wrapped by scipy, and banded matrix-matrix multiplication is not defined in
+  BLAS or in scipy.
+  The bandmat package contains C implementations of these operations written in
+  cython.
+- helper functions for converting between full and banded matrix
+  representations.
+- certain linear algebra operations on banded matrices.
+- an implementation of a fancy indexed += function for numpy arrays.
+  This is included for (the author's) convenience and is not directly related
+  to banded matrix manipulation.
+
+Only square banded matrices are supported by this package.
+
+Examples
+--------
+
+Define a banded matrix:
+>>> import numpy as np
+>>> import bandmat as bm
+>>> # Ensure consistent formatting between old and new versions of numpy.
+>>> try:
+...   np.set_printoptions(legacy='1.13')
+... except TypeError:
+...   pass
+>>> a_bm = bm.BandMat(1, 1, np.arange(15, dtype=np.float64).reshape((3, 5)))
+>>> print(a_bm.data)
+[[  0.   1.   2.   3.   4.]
+ [  5.   6.   7.   8.   9.]
+ [ 10.  11.  12.  13.  14.]]
+
+Get the equivalent numpy array:
+>>> print(a_bm.full())
+[[  5.   1.   0.   0.   0.]
+ [ 10.   6.   2.   0.   0.]
+ [  0.  11.   7.   3.   0.]
+ [  0.   0.  12.   8.   4.]
+ [  0.   0.   0.  13.   9.]]
+
+Take the transpose:
+>>> print(a_bm.T.full())
+[[  5.  10.   0.   0.   0.]
+ [  1.   6.  11.   0.   0.]
+ [  0.   2.   7.  12.   0.]
+ [  0.   0.   3.   8.  13.]
+ [  0.   0.   0.   4.   9.]]
+
+Banded matrix addition:
+>>> b_bm = bm.BandMat(0, 1, np.arange(10, dtype=np.float64).reshape((2, 5)))
+>>> print((a_bm + b_bm).full())
+[[ 10.   2.   0.   0.   0.]
+ [ 10.  12.   4.   0.   0.]
+ [  0.  11.  14.   6.   0.]
+ [  0.   0.  12.  16.   8.]
+ [  0.   0.   0.  13.  18.]]
+
+Banded matrix multiplication:
+>>> b_bm = bm.BandMat(0, 1, np.arange(10, dtype=np.float64).reshape((2, 5)))
+>>> print(bm.dot_mm(a_bm, b_bm).full())
+[[  25.   11.    2.    0.    0.]
+ [  50.   46.   26.    6.    0.]
+ [   0.   66.   71.   45.   12.]
+ [   0.    0.   84.  100.   68.]
+ [   0.    0.    0.  104.  133.]]
+"""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+from .core import *
+from .tensor import *
\ No newline at end of file
diff --git a/nnmnkwii/paramgen/_bandmat/core.pyx b/nnmnkwii/paramgen/_bandmat/core.pyx
new file mode 100644
index 00000000..d5e1098d
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/core.pyx
@@ -0,0 +1,704 @@
+# cython: language_level=3
+
+"""Core banded matrix definitions and functions."""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+from nnmnkwii.paramgen._bandmat import full as fl
+
+import numpy as np
+
+cimport numpy as cnp
+cimport cython
+
+cnp.import_array()
+cnp.import_ufunc()
+
+class BandMat(object):
+    """A memory-efficient representation of a square banded matrix.
+
+    An N by N matrix with bandwidth D can be stored efficiently by storing its
+    band as a rectangular D by N matrix.
+    This class is a lightweight wrapper around a rectangular matrix being used
+    in this way, and stores additional details needed to recover the square
+    matrix such as the lower and upper bandwidths.
+
+    The representation used for the rectangular matrix is the same one used by
+    BLAS and LAPACK (and thus scipy): the columns of the rectangular matrix are
+    (parts of) the columns of the square matrix being represented, and the
+    successive rows of the rectangular matrix give the superdiagonals, diagonal
+    and subdiagonals in order (starting with the outermost superdiagonal and
+    ending with the outermost subdiagonal).
+    See the "Band Storage" section of the LAPACK Users' Guide at
+    http://www.netlib.org/lapack/lug/node124.html or the docstring for
+    scipy.linalg.solve_banded for some examples.
+
+    `l` is the number of subdiagonals stored and `u` is the number of
+    superdiagonals stored.
+    Thus `l` and `u` determine the band in which the entries of the represented
+    matrix can be non-zero.
+    `data` is the LAPACK-style banded matrix representation of the square
+    matrix (or of the transpose of the square matrix if `transposed` is True)
+    stored as a numpy array.
+    Note that if `transposed` is True, `l` and `u` still refer to the square
+    matrix being represented rather than to its transpose.
+    """
+    def __init__(self, l, u, data, transposed=False):
+        self.l = l
+        self.u = u
+        self.data = data
+        self.transposed = transposed
+
+        assert self.l >= 0
+        assert self.u >= 0
+        assert self.data.ndim == 2
+        assert self.data.shape[0] == self.l + self.u + 1
+
+    def __repr__(self):
+        return ('BandMat(%r, %r, %r, transposed=%r)' %
+                (self.l, self.u, self.data, self.transposed))
+
+    @property
+    def size(self):
+        """Returns the size of this matrix."""
+        return self.data.shape[1]
+
+    @property
+    def T(self):
+        """Returns the transpose of this matrix.
+
+        This is a cheap operation since it just sets a flag internally.
+        The returned BandMat has the same underlying data array as `self`.
+        """
+        return BandMat(self.u, self.l, self.data,
+                       transposed=not self.transposed)
+
+    def full(self):
+        """Converts this BandMat to a conventional numpy array.
+
+        The returned numpy array represents the same matrix as `self`.
+        """
+        if self.transposed:
+            return fl.band_c(self.u, self.l, self.data).T
+        else:
+            return fl.band_c(self.l, self.u, self.data)
+
+    def copy_exact(self):
+        """Returns a copy of this BandMat.
+
+        The returned BandMat represents the same matrix as `self`, but has a
+        newly-created underlying data array.
+        It has the same `transposed` setting as `self`.
+        """
+        return BandMat(self.l, self.u, self.data.copy(),
+                       transposed=self.transposed)
+
+    def copy(self):
+        """Returns a copy of this BandMat with transposed set to False.
+
+        The returned BandMat represents the same matrix as `self`, but has a
+        newly-created underlying data array, and always has `transposed` set to
+        False.
+        """
+        l = self.l
+        u = self.u
+        if self.transposed:
+            return BandMat(l, u, fl.band_cTe(u, l, self.data))
+        else:
+            return BandMat(l, u, self.data.copy())
+
+    def equiv(self, l_new=None, u_new=None, transposed_new=None,
+              zero_extra=False):
+        """Returns an equivalent BandMat stored differently.
+
+        The returned BandMat represents the same matrix as `self`, but has a
+        newly-created underlying data array, and has possibly different
+        parameters `l`, `u` and `transposed`.
+        The new values of these parameters are given by `l_new`, `u_new` and
+        `transposed_new`, with the corresponding value from `self` used if any
+        of these are None.
+
+        If `zero_extra` is True then the underlying data array of the returned
+        BandMat is guaranteed to have extra entries set to zero.
+        """
+        l = self.l
+        u = self.u
+        if l_new is None:
+            l_new = l
+        if u_new is None:
+            u_new = u
+        if transposed_new is None:
+            transposed_new = self.transposed
+
+        assert l_new >= l
+        assert u_new >= u
+
+        data_new = np.empty((l_new + u_new + 1, self.size))
+
+        ll, uu = (u, l) if transposed_new else (l, u)
+        ll_new, uu_new = (u_new, l_new) if transposed_new else (l_new, u_new)
+
+        data_new[(uu_new - uu_new):(uu_new - uu)] = 0.0
+        data_new[(uu_new + ll + 1):(uu_new + ll_new + 1)] = 0.0
+        data_new_co = data_new[(uu_new - uu):(uu_new + ll + 1)]
+        if self.transposed == transposed_new:
+            data_new_co[:] = self.data
+            if zero_extra:
+                fl.zero_extra_entries(ll, uu, data_new_co)
+        else:
+            fl.band_cTe(uu, ll, self.data, target_rect=data_new_co)
+
+        return BandMat(l_new, u_new, data_new, transposed=transposed_new)
+
+    @cython.boundscheck(False)
+    def plus_equals_band_of(self, mat_bm, double mult=1.0):
+        """Adds a multiple of a band of another matrix to this matrix in-place.
+
+        Any entries of `mat_bm` which lie outside of `self` are ignored.
+        Thus to implement conventional matrix addition, `self` must be large
+        enough to contain the result of the addition, i.e. `self` must have at
+        least as many subdiagonals and superdiagonals as `mat_bm`.
+
+        The statement `target_bm.plus_equals_band_of(mat_bm, mult)` where
+        `target_bm` and `mat_bm` are BandMats is the equivalent of:
+
+            target_full += band_ec(l, u, mat_full) * mult
+
+        where `target_full` and `mat_full` are square numpy arrays.
+        Here `l` is `target_bm.l` and `u` is `target_bm.u`.
+        """
+        cdef long frames
+        cdef long l_a, u_a, transposed_a
+        cdef long l_b, u_b, transposed_b
+        cdef cnp.ndarray[cnp.float64_t, ndim=2] a_data
+        cdef cnp.ndarray[cnp.float64_t, ndim=2] b_data
+
+        l_a = self.l
+        u_a = self.u
+        a_data = self.data
+        transposed_a = self.transposed
+        assert l_a >= 0
+        assert u_a >= 0
+        assert a_data.shape[0] == l_a + u_a + 1
+
+        l_b = mat_bm.l
+        u_b = mat_bm.u
+        b_data = mat_bm.data
+        transposed_b = mat_bm.transposed
+        assert l_b >= 0
+        assert u_b >= 0
+        assert b_data.shape[0] == l_b + u_b + 1
+
+        frames = a_data.shape[1]
+        assert b_data.shape[1] == frames
+
+        cdef long o
+        cdef unsigned long row_a, row_b
+        cdef long d_a, d_b
+        cdef unsigned long frame
+
+        for o in range(-min(u_a, u_b), min(l_a, l_b) + 1):
+            row_a = (l_a - o) if transposed_a else (u_a + o)
+            row_b = (l_b - o) if transposed_b else (u_b + o)
+            d_a = o if transposed_a else 0
+            d_b = o if transposed_b else 0
+            for frame in range(max(0, -o), max(0, frames + min(0, -o))):
+                a_data[row_a, frame + d_a] += b_data[row_b, frame + d_b] * mult
+
+        return
+
+    def __add__(self, other):
+        """Sums two banded matrices.
+
+        The expression `a_bm + b_bm` where `a_bm` and `b_bm` are BandMats is
+        the equivalent of:
+
+            a_full + b_full
+
+        where `a_full` and `b_full` are square numpy arrays.
+        """
+        # (FIXME : is "is 0" the right way to test for this?)
+        if other is 0:
+            return self.copy_exact()
+        elif not isinstance(other, BandMat):
+            return NotImplemented
+
+        assert self.size == other.size
+        c_bm = self.equiv(l_new=max(self.l, other.l),
+                          u_new=max(self.u, other.u))
+        c_bm.plus_equals_band_of(other)
+        return c_bm
+
+    def __radd__(self, other):
+        if other is 0:
+            # this is so "sum" will work as expected
+            return self.copy_exact()
+        else:
+            return NotImplemented
+
+    def __sub__(self, other):
+        """Subtracts one banded matrix from another.
+
+        The expression `a_bm - b_bm` where `a_bm` and `b_bm` are BandMats is
+        the equivalent of:
+
+            a_full - b_full
+
+        where `a_full` and `b_full` are square numpy arrays.
+        """
+        if other is 0:
+            return self.copy_exact()
+        elif not isinstance(other, BandMat):
+            return NotImplemented
+
+        assert self.size == other.size
+        c_bm = self.equiv(l_new=max(self.l, other.l),
+                          u_new=max(self.u, other.u))
+        c_bm.plus_equals_band_of(other, mult=-1.0)
+        return c_bm
+
+    def __rsub__(self, other):
+        if other is 0:
+            return self.__neg__()
+        else:
+            return NotImplemented
+
+    def __iadd__(self, other):
+        """Adds another matrix to this matrix in-place.
+
+        The statement `a_bm += b_bm` where `a_bm` and `b_bm` are BandMats is
+        the equivalent of:
+
+            a_full += b_full
+
+        where `a_full` and `b_full` are square numpy arrays.
+        """
+        if other is 0:
+            return self
+        elif not isinstance(other, BandMat):
+            return NotImplemented
+
+        assert self.size == other.size
+        assert self.l >= other.l
+        assert self.u >= other.u
+
+        self.plus_equals_band_of(other)
+        return self
+
+    def __isub__(self, other):
+        """Subtracts another matrix from this matrix in-place.
+
+        The statement `a_bm -= b_bm` where `a_bm` and `b_bm` are BandMats is
+        the equivalent of:
+
+            a_full -= b_full
+
+        where `a_full` and `b_full` are square numpy arrays.
+        """
+        if other is 0:
+            return self
+        elif not isinstance(other, BandMat):
+            return NotImplemented
+
+        assert self.size == other.size
+        assert self.l >= other.l
+        assert self.u >= other.u
+
+        self.plus_equals_band_of(other, mult=-1.0)
+        return self
+
+    def __pos__(self):
+        """Take the positive of a banded matrix.
+
+        The expression `+a_bm` where `a_bm` is a BandMat is the equivalent of:
+
+            +a_full
+
+        where `a_full` is a square numpy array.
+        """
+        return BandMat(self.l, self.u, +self.data,
+                       transposed=self.transposed)
+
+    def __neg__(self):
+        """Take the negative of a banded matrix.
+
+        The expression `-a_bm` where `a_bm` is a BandMat is the equivalent of:
+
+            -a_full
+
+        where `a_full` is a square numpy array.
+        """
+        return BandMat(self.l, self.u, -self.data,
+                       transposed=self.transposed)
+
+    def __mul__(self, other):
+        """Multiplies a banded matrix by a scalar.
+
+        The expression `a_bm * mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full * mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        return BandMat(self.l, self.u, self.data * mult,
+                       transposed=self.transposed)
+
+    def __rmul__(self, other):
+        return self.__mul__(other)
+
+    def __floordiv__(self, other):
+        """Floor-divides a banded matrix by a scalar.
+
+        The expression `a_bm // mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full // mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        return BandMat(self.l, self.u, self.data.__floordiv__(mult),
+                       transposed=self.transposed)
+
+    def __div__(self, other):
+        """Old-style divides a banded matrix by a scalar.
+
+        When using old-style division (c.f. `from __future__ import division`),
+        the expression `a_bm / mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full / mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        return BandMat(self.l, self.u, self.data.__div__(mult),
+                       transposed=self.transposed)
+
+    def __truediv__(self, other):
+        """Divides a banded matrix by a scalar.
+
+        When using new-style division (c.f. `from __future__ import division`),
+        the expression `a_bm / mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full / mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        return BandMat(self.l, self.u, self.data.__truediv__(mult),
+                       transposed=self.transposed)
+
+    def __imul__(self, other):
+        """Multiplies this matrix by a scalar in-place.
+
+        The statement `a_bm *= mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full *= mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        self.data *= mult
+        return self
+
+    def __ifloordiv__(self, other):
+        """Floor-divides this matrix by a scalar in-place.
+
+        The statement `a_bm //= mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full //= mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        self.data.__ifloordiv__(mult)
+        return self
+
+    def __idiv__(self, other):
+        """Old-style divides this matrix by a scalar in-place.
+
+        When using old-style division (c.f. `from __future__ import division`),
+        the expression `a_bm /= mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full /= mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        self.data.__itruediv__(mult)
+        return self
+
+    def __itruediv__(self, other):
+        """Divides this matrix by a scalar in-place.
+
+        When using new-style division (c.f. `from __future__ import division`),
+        the expression `a_bm /= mult` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full /= mult
+
+        where `a_full` is a square numpy array.
+        """
+        try:
+            mult = float(other)
+        except:
+            return NotImplemented
+
+        self.data.__itruediv__(mult)
+        return self
+
+    def reverse_view(self):
+        """Returns the "time-reversed" version of this matrix.
+
+        This function is implemented by taking a view of the underlying data
+        array.
+        To obtain a BandMat with a fresh underlying data array, `.copy_exact()`
+        should be called on the result.
+
+        The expression `a_bm.reverse_view()` where `a_bm` is a BandMat is the
+        equivalent of:
+
+            a_full[::-1, ::-1]
+
+        where `a_full` is a square numpy array.
+        """
+        return BandMat(self.u, self.l, self.data[::-1, ::-1],
+                       transposed=self.transposed)
+
+    def sub_matrix_view(self, start, end):
+        """Returns a sub-matrix of this matrix.
+
+        This is implemented by taking a view of the underlying data array.
+        To obtain a BandMat with a fresh underlying data array, `.copy_exact()`
+        should be called on the result.
+
+        The expression `a_bm.sub_matrix_view(start, end)` where `a_bm` is a
+        BandMat is the equivalent of:
+
+            a_full[start:end, start:end]
+
+        where `a_full` is a square numpy array.
+        """
+        assert 0 <= start <= end <= self.size
+        return BandMat(self.l, self.u, self.data[:, start:end],
+                       transposed=self.transposed)
+
+    def embed_as_sub_matrix(self, start, size):
+        """Returns a new matrix containing this matrix as a sub-matrix.
+
+        The statement `b_bm = a_bm.embed_as_sub_matrix(start, size)` where
+        `a_bm` is a BandMat is the equivalent of:
+
+            end = start + len(a_full)
+            b_full = np.zeros((size, size))
+            b_full[start:end, start:end] = a_full
+
+        where `a_full` is a square numpy array.
+        """
+        end = start + self.size
+        assert 0 <= start <= end <= size
+
+        l, u = self.l, self.u
+        ll, uu = (u, l) if self.transposed else (l, u)
+
+        data = np.empty((l + u + 1, size))
+        data[:, 0:start] = 0.0
+        data[:, start:end] = self.data
+        data[:, end:size] = 0.0
+        fl.zero_extra_entries(ll, uu, data[:, start:end])
+        return BandMat(self.l, self.u, data, transposed=self.transposed)
+
+def zeros(l, u, size):
+    """Returns the zero matrix as a BandMat.
+
+    The returned BandMat `ret_bm` has `ret_bm.l == l`, `ret_bm.u == u` and
+    `ret_bm.size == size`.
+    """
+    data = np.zeros((l + u + 1, size))
+    return BandMat(l, u, data)
+
+def from_full(l, u, mat_full):
+    """Converts a square banded numpy array to a BandMat.
+
+    The returned BandMat represents the same matrix as `mat_full`.
+    `mat_full` should be a numpy array representing a square matrix with zeros
+    outside the band specified by `l` and `u`.
+    An AssertionError is raised if `mat_full` has non-zero entries outside the
+    specified band.
+    """
+    mat_bm = BandMat(l, u, fl.band_e(l, u, mat_full))
+    # check `mat_full` is zero outside the specified band
+    assert np.all(mat_bm.full() == mat_full)
+    return mat_bm
+
+def band_c_bm(l, u, mat_rect):
+    """Constructs a BandMat from its band.
+
+    The expression `band_c_bm(l, u, mat_rect)` where `mat_rect` is a
+    rectangular numpy array is the equivalent of:
+
+        band_c(l, u, mat_rect)
+
+    where the returned value is a square numpy array.
+    """
+    return BandMat(l, u, mat_rect.copy())
+
+def band_e_bm(l, u, mat_bm):
+    """Extracts a band of a BandMat.
+
+    The band to extract is specified by `l` and `u`.
+
+    The expression `band_e_bm(l, u, mat_bm)` where `mat_bm` is a BandMat is the
+    equivalent of:
+
+        band_e(l, u, mat_full)
+
+    where `mat_full` is a square numpy array.
+    """
+    mat_bm_co = band_ec_bm_view(l, u, mat_bm)
+    mat_bm_new = mat_bm_co.equiv(l_new=l, u_new=u,
+                                 transposed_new=False,
+                                 zero_extra=True)
+    return mat_bm_new.data
+
+band_ce_bm = fl.band_ce
+
+def band_ec_bm_view(l, u, mat_bm):
+    """Effectively applies `band_e_bm` then `band_c_bm`, sharing data arrays.
+
+    The combined operation has the effect of zeroing the entries outside the
+    band specified by `l` and `u`.
+    This is implemented by taking a view of `mat_bm`'s underlying data array.
+    To obtain a BandMat with a fresh underlying data array, `.copy_exact()`
+    should be called on the result.
+    """
+    assert l >= 0
+    assert u >= 0
+
+    l_in = mat_bm.l
+    u_in = mat_bm.u
+    l_out = min(l, l_in)
+    u_out = min(u, u_in)
+    if mat_bm.transposed:
+        return BandMat(
+            l_out, u_out,
+            mat_bm.data[(l_in - l_out):(l_in + u_out + 1)],
+            transposed = True
+        )
+    else:
+        return BandMat(
+            l_out, u_out,
+            mat_bm.data[(u_in - u_out):(u_in + l_out + 1)]
+        )
+
+def band_ec_bm(l, u, mat_bm):
+    """Effectively applies `band_e_bm` then `band_c_bm`.
+
+    The combined operation has the effect of zeroing the entries outside the
+    band specified by `l` and `u`.
+
+    The expression `band_ec_bm(l, u, mat_bm)` where `mat_bm` is a BandMat is
+    the equivalent of:
+
+        band_ec(l, u, mat_full)
+
+    where `mat_full` and the returned value are square numpy arrays.
+    """
+    return band_ec_bm_view(l, u, mat_bm).copy_exact()
+
+def band_e_bm_common(*mat_bms):
+    """Extracts the band from several BandMats in a common way.
+
+    The same band is used for each of the BandMats, and this band is the
+    smallest band that encompasses the bands of all the BandMats.
+    This allows BandMats to be compared with the same tools used to compare
+    conventional numpy arrays (and without having to explicitly construct the
+    corresponding full matrices).
+
+    Example usage:
+    >>> import bandmat as bm
+    >>> cc = bm.band_e_bm_common
+    >>> def allclose(x, y): return x.shape == y.shape and np.allclose(x, y)
+    >>> a_bm = bm.zeros(3, 2, 10)
+    >>> b_bm = bm.zeros(1, 3, 10).T
+    >>> # these represent the same matrix
+    >>> allclose(a_bm.full(), b_bm.full())
+    True
+    >>> # naive comparison doesn't show this
+    >>> allclose(a_bm.data, b_bm.data)
+    False
+    >>> # comparison after normalizing works
+    >>> allclose(*cc(a_bm, b_bm))
+    True
+    """
+    if not mat_bms:
+        return []
+    else:
+        l = max([ mat_bm.l for mat_bm in mat_bms ])
+        u = max([ mat_bm.u for mat_bm in mat_bms ])
+        return [ band_e_bm(l, u, mat_bm) for mat_bm in mat_bms ]
+
+def diag(vec_or_mat_bm):
+    """Constructs a diagonal BandMat or extracts the diagonal from a BandMat.
+
+    If `vec_or_mat_bm` is a vector `vec` (i.e. a one-dimensional numpy array),
+    then this function returns a BandMat representing the diagonal matrix with
+    diagonal `vec`.
+    If `vec_or_mat_bm` is a BandMat `mat_bm`, then this function returns the
+    vector consisting of the diagonal of the matrix represented by `mat_bm`.
+    In both cases the array storing the vector and the BandMat's underlying
+    data array share memory.
+
+    The expression `diag(vec_or_mat_bm)` where `vec_or_mat_bm` is a vector or a
+    BandMat is the equivalent of:
+
+        np.diag(vec_or_mat_full)
+
+    where `vec_or_mat_full` is a vector or a square numpy array.
+    """
+    if not isinstance(vec_or_mat_bm, BandMat):
+        vec = vec_or_mat_bm
+        assert vec.ndim == 1
+        return BandMat(0, 0, np.reshape(vec, (1, -1)))
+    else:
+        mat_bm = vec_or_mat_bm
+        row = mat_bm.l if mat_bm.transposed else mat_bm.u
+        return mat_bm.data[row]
diff --git a/nnmnkwii/paramgen/_bandmat/full.pyx b/nnmnkwii/paramgen/_bandmat/full.pyx
new file mode 100644
index 00000000..77f78edd
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/full.pyx
@@ -0,0 +1,200 @@
+# cython: language_level=3
+
+"""Operations involving the bands of square matrices.
+
+This module provides operations involving the bands of square matrices which
+are stored as numpy arrays.
+For example it provides functions to construct a square matrix with given band,
+and to extract the band from an arbitrary square matrix.
+Though this is closely related to the representation of square banded matrices
+used by the `BandMat` class in the `core` module, it is logically distinct.
+"""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+import numpy as np
+
+cimport numpy as cnp
+cimport cython
+
+cnp.import_array()
+cnp.import_ufunc()
+
+@cython.boundscheck(False)
+def band_c(long l, long u, cnp.ndarray[cnp.float64_t, ndim=2] mat_rect):
+    """Constructs a square banded matrix from its band.
+
+    Given a rectangular numpy array `mat_rect`, this function returns a square
+    numpy array `mat_full` with its `u` superdiagonals, diagonal and `l`
+    subdiagonals given by the rows of `mat_rect`.
+    The part of each column of `mat_full` that lies within the band contains
+    the same entries as the corresponding column of `mat_rect`.
+
+    Not all the entries of `mat_rect` affect `mat_full`.
+    The (l, u)-extra entries of a rectangular matrix `mat_rect` are defined as
+    the entries which have no effect on the result of `band_c(l, u, mat_rect)`.
+    They lie in the upper-left and bottom-right corners of `mat_rect`.
+    """
+    assert l >= 0
+    assert u >= 0
+    assert mat_rect.shape[0] == l + u + 1
+
+    cdef long size
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] mat_full
+
+    size = mat_rect.shape[1]
+    mat_full = np.zeros((size, size))
+
+    cdef long i
+    cdef unsigned long row
+    cdef unsigned long j
+
+    for i in range(-u, l + 1):
+        row = u + i
+        for j in range(max(0, -i), max(0, size + min(0, -i))):
+            mat_full[j + i, j] = mat_rect[row, j]
+
+    return mat_full
+
+@cython.boundscheck(False)
+def band_e(long l, long u, cnp.ndarray[cnp.float64_t, ndim=2] mat_full):
+    """Extracts a band of a square matrix.
+
+    Given a square numpy array `mat_full`, returns a rectangular numpy array
+    `mat_rect` with rows corresponding to the `u` superdiagonals, the diagonal
+    and the `l` subdiagonals of `mat_full`.
+    The square matrix is "collapsed column-wise", i.e. each column of
+    `mat_rect` contains the same entries as the part of the corresponding
+    column of `mat_full` that lies within the band defined by (l, u).
+    The extra entries in the corners of `mat_rect` which do not correspond to
+    any entry in `mat_full` are set to zero.
+    """
+    assert l >= 0
+    assert u >= 0
+    assert mat_full.shape[1] == mat_full.shape[0]
+
+    cdef long size
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] mat_rect
+
+    size = mat_full.shape[0]
+    mat_rect = np.empty((l + u + 1, size))
+
+    cdef long i
+    cdef unsigned long row
+    cdef unsigned long j
+
+    for i in range(-u, l + 1):
+        row = u + i
+        for j in range(size):
+            # "j + i < size" below uses wraparound on unsigned long to actually
+            #   check that "0 <= j + i < size"
+            mat_rect[row, j] = mat_full[j + i, j] if j + i < size else 0.0
+
+    return mat_rect
+
+@cython.boundscheck(False)
+def zero_extra_entries(long l, long u,
+                       cnp.ndarray[cnp.float64_t, ndim=2] mat_rect):
+    """Zeroes the extra entries of a rectangular matrix.
+
+    See the docstring for `band_c` for the definition of extra entries.
+
+    The statement `zero_extra_entries(l, u, mat_rect)` is equivalent to:
+
+        mat_rect[:] = band_e(l, u, band_c(l, u, mat_rect))
+
+    N.B. in-place, i.e. mutates `mat_rect`.
+    """
+    assert l >= 0
+    assert u >= 0
+    assert mat_rect.shape[0] == l + u + 1
+
+    cdef long size
+
+    size = mat_rect.shape[1]
+
+    cdef long i
+    cdef unsigned long row
+    cdef unsigned long j
+
+    for i in range(-u, 0):
+        row = u + i
+        for j in range(0, min(size, -i)):
+            mat_rect[row, j] = 0.0
+    for i in range(1, l + 1):
+        row = u + i
+        for j in range(max(0, size - i), size):
+            mat_rect[row, j] = 0.0
+
+    return
+
+def band_ce(l, u, mat_rect):
+    """Effectively applies `band_c` then `band_e`.
+
+    The combined operation has the effect of zeroing the extra entries in (a
+    copy of) the rectangular matrix `mat_rect`.
+
+    The expression `band_ce(l, u, mat_rect)` is equivalent to:
+
+        band_e(l, u, band_c(l, u, mat_rect))
+    """
+    mat_rect_new = mat_rect.copy()
+    zero_extra_entries(l, u, mat_rect_new)
+    return mat_rect_new
+
+def band_ec(l, u, mat_full):
+    """Effectively applies `band_e` then `band_c`.
+
+    The combined operation has the effect of zeroing the entries outside the
+    band specified by `l` and `u` in (a copy of) the square matrix `mat_full`.
+
+    The expression `band_ec(l, u, mat_full)` is equivalent to:
+
+        band_c(l, u, band_e(l, u, mat_full))
+    """
+    return band_c(l, u, band_e(l, u, mat_full))
+
+@cython.boundscheck(False)
+def band_cTe(long l, long u,
+             cnp.ndarray[cnp.float64_t, ndim=2] mat_rect,
+             cnp.ndarray[cnp.float64_t, ndim=2] target_rect=None):
+    """Effectively applies `band_c` then transpose then `band_e`.
+
+    The expression `band_cTe(l, u, mat_rect)` is equivalent to:
+
+        band_e(u, l, band_c(l, u, mat_rect).T)
+    """
+    assert l >= 0
+    assert u >= 0
+    assert mat_rect.shape[0] == l + u + 1
+
+    cdef long size
+    cdef long target_given = (target_rect is not None)
+
+    size = mat_rect.shape[1]
+    if target_given:
+        assert target_rect.shape[0] == l + u + 1
+        assert target_rect.shape[1] == size
+    else:
+        target_rect = np.empty((l + u + 1, size))
+
+    cdef long i
+    cdef unsigned long row, row_new
+    cdef unsigned long j
+
+    for i in range(-u, l + 1):
+        row = u + i
+        row_new = l - i
+        for j in range(size):
+            # "j - i < size" below uses wraparound on unsigned long to actually
+            #   check that "0 <= j - i < size"
+            target_rect[row_new, j] = (mat_rect[row, j - i] if j - i < size
+                                       else 0.0)
+
+    if target_given:
+        return
+    else:
+        return target_rect
diff --git a/nnmnkwii/paramgen/_bandmat/linalg.pyx b/nnmnkwii/paramgen/_bandmat/linalg.pyx
new file mode 100644
index 00000000..541289bb
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/linalg.pyx
@@ -0,0 +1,379 @@
+# cython: language_level=3
+
+"""Linear algebra operations for banded matrices.
+
+Several of the functions in this module deal with symmetric banded matrices.
+Symmetric banded matrices can in principle be stored even more efficiently than
+general banded matrices, by not storing the superdiagonals (or subdiagonals).
+However in bandmat the simpler, more explicit representation which stores the
+whole band is used instead.
+This is slightly less efficient in terms of memory and time, but results in a
+more unified interface and simpler code in the author's limited experience.
+Note that this is in line with the convention used by numpy and scipy for
+symmetric "full" (i.e. non-banded) matrices, where both the upper and lower
+triangles are represented in memory.
+"""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+from nnmnkwii.paramgen import _bandmat as bm
+from nnmnkwii.paramgen._bandmat import BandMat
+from nnmnkwii.paramgen._bandmat import full as fl
+
+import numpy as np
+import scipy.linalg as sla
+
+cimport numpy as cnp
+cimport cython
+from libc.math cimport sqrt
+
+cnp.import_array()
+cnp.import_ufunc()
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def _cholesky_banded(cnp.ndarray[cnp.float64_t, ndim=2] mat,
+                     long overwrite_ab=False,
+                     long lower=False):
+    """A cython reimplementation of scipy.linalg.cholesky_banded.
+
+    Using lower == True is slightly more time and space efficient than using
+    lower == False for this implementation.
+    The default is False only for compatibility with the scipy implementation.
+
+    (This simple implementation seems to be much faster than the scipy
+    implementation, especially for small bandwidths.
+    This is surprising since the scipy implementation wraps an LAPACK call
+    which one would expect to be very fast.
+    I am still unsure why this is, though I have checked it is not to do with
+    Fortran-contiguous versus C-contiguous ordering.
+    The scipy implementation performs a relatively expensive finite-value check
+    which is arguably not necessary for our implementation, but this only
+    accounts for part of the difference.)
+    """
+    cdef unsigned long frames, depth
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] mat_orig
+
+    frames = mat.shape[1]
+    assert mat.shape[0] >= 1
+    depth = mat.shape[0] - 1
+
+    mat_orig = mat
+    if not lower:
+        mat = fl.band_cTe(0, depth, mat)
+    elif not overwrite_ab:
+        mat = mat.copy()
+
+    cdef unsigned long frame, k, l
+    cdef double v0, iv0, siv0
+    cdef cnp.ndarray[cnp.float64_t, ndim=1] v = np.empty(
+        (depth,),
+        dtype=np.float64
+    )
+
+    for frame in range(frames):
+        v0 = mat[<unsigned long>(0), frame]
+        if v0 <= 0.0:
+            raise sla.LinAlgError(
+                '%d-th leading minor not positive definite' % (frame + 1)
+            )
+        iv0 = 1.0 / v0
+        siv0 = sqrt(iv0)
+
+        for k in range(depth):
+            v[k] = mat[k + 1, frame]
+
+        mat[<unsigned long>(0), frame] = 1.0 / siv0
+        for k in range(depth):
+            mat[k + 1, frame] = v[k] * siv0
+
+        for k in range(min(depth, frames - frame - 1)):
+            for l in range(depth - k):
+                mat[l, k + frame + 1] -= v[l + k] * v[k] * iv0
+
+    if not lower:
+        if overwrite_ab:
+            fl.band_cTe(depth, 0, mat, target_rect=mat_orig)
+            mat = mat_orig
+        else:
+            mat = fl.band_cTe(depth, 0, mat)
+
+    return mat
+
+@cython.boundscheck(False)
+@cython.cdivision(True)
+def _solve_triangular_banded(cnp.ndarray[cnp.float64_t, ndim=2] a_rect,
+                             cnp.ndarray[cnp.float64_t, ndim=1] b,
+                             long transposed=False,
+                             long lower=False,
+                             long overwrite_b=False):
+    """A cython implementation of missing scipy.linalg.solve_triangular_banded.
+
+    Solves A . x = b (transposed == False) or A.T . x = b (transposed == True)
+    for x, where A is a upper or lower triangular banded matrix, x and b are
+    vectors, . indicates matrix multiplication, and A.T denotes the transpose
+    of A.
+    The argument `lower` indicates whether `a_rect` stores a lower triangle or
+    an upper triangle.
+
+    This replicates functionality present in the LAPACK routine dtbtrs.
+    If this function existed in scipy, it would probably be
+    scipy.linalg.solve_triangular_banded.
+    """
+    cdef unsigned long frames, depth
+    cdef long solve_lower
+    cdef cnp.ndarray[cnp.float64_t, ndim=1] x
+
+    frames = a_rect.shape[1]
+    assert a_rect.shape[0] >= 1
+    depth = a_rect.shape[0] - 1
+    assert b.shape[0] == frames
+
+    solve_lower = (lower != transposed)
+
+    if overwrite_b:
+        x = b
+    else:
+        x = np.empty_like(b)
+
+    cdef unsigned long pos, frame, framePrev, k
+    cdef double diff, denom
+
+    for pos in range(frames):
+        frame = pos if solve_lower else frames - 1 - pos
+        diff = b[frame]
+        if lower:
+            if transposed:
+                for k in range(1, min(depth + 1, pos + 1)):
+                    framePrev = frame + k
+                    diff -= a_rect[k, frame] * x[framePrev]
+                denom = a_rect[<unsigned long>(0), frame]
+            else:
+                for k in range(1, min(depth + 1, pos + 1)):
+                    framePrev = frame - k
+                    diff -= a_rect[k, framePrev] * x[framePrev]
+                denom = a_rect[<unsigned long>(0), frame]
+        else:
+            if transposed:
+                for k in range(1, min(depth + 1, pos + 1)):
+                    framePrev = frame - k
+                    diff -= a_rect[depth - k, frame] * x[framePrev]
+                denom = a_rect[depth, frame]
+            else:
+                for k in range(1, min(depth + 1, pos + 1)):
+                    framePrev = frame + k
+                    diff -= a_rect[depth - k, framePrev] * x[framePrev]
+                denom = a_rect[depth, frame]
+        if denom == 0.0:
+            raise sla.LinAlgError(
+                'singular matrix: resolution failed at diagonal %d' % frame
+            )
+        x[frame] = diff / denom
+
+    return x
+
+def cholesky(mat_bm, lower=False, alternative=False):
+    """Computes the Cholesky factor of a positive definite banded matrix.
+
+    The conventional Cholesky decomposition of a positive definite matrix P is
+    P = L . L.T, where the lower Cholesky factor L is lower-triangular, the
+    upper Cholesky factor L.T is upper-triangular, and . indicates matrix
+    multiplication.
+    Each positive definite matrix has a unique Cholesky decomposition.
+    Given a positive definite banded matrix, this function computes its
+    Cholesky factor, which is also a banded matrix.
+
+    This function can also work with the alternative Cholesky decomposition.
+    The alternative Cholesky decomposition of P is P = L.T . L, where again L
+    is lower-triangular.
+    Whereas the conventional Cholesky decomposition is intimately connected
+    with linear Gaussian autoregressive probabilistic models defined forwards
+    in time, the alternative Cholesky decomposition is intimately connected
+    with linear Gaussian autoregressive probabilistic models defined backwards
+    in time.
+
+    `mat_bm` (representing P above) should represent the whole matrix, not just
+    the lower or upper triangles.
+    If the matrix represented by `mat_bm` is not symmetric then the behavior of
+    this function is undefined (currently either the upper or lower triangle is
+    used to compute the Cholesky factor and the rest of the matrix is ignored).
+    If `lower` is True then the lower Cholesky factor is returned, and
+    otherwise the upper Cholesky factor is returned.
+    If `alternative` is True then the Cholesky factor returned is for the
+    alternative Cholesky decomposition, and otherwise it is for the
+    conventional Cholesky decomposition.
+    """
+    if mat_bm.transposed:
+        mat_bm = mat_bm.T
+
+    depth = mat_bm.l
+    assert mat_bm.u == depth
+    assert depth >= 0
+
+    l = depth if lower else 0
+    u = 0 if lower else depth
+
+    mat_half_data = mat_bm.data[(depth - u):(depth + l + 1)]
+    if alternative:
+        chol_data = _cholesky_banded(mat_half_data[::-1, ::-1],
+                                     lower=(not lower))[::-1, ::-1]
+    else:
+        chol_data = _cholesky_banded(mat_half_data, lower=lower)
+    chol_bm = BandMat(l, u, chol_data)
+
+    return chol_bm
+
+def solve_triangular(a_bm, b):
+    """Solves a triangular banded matrix equation.
+
+    Solves A . x = b for x, where A is a upper or lower triangular banded
+    matrix, x and b are vectors, and . indicates matrix multiplication.
+    """
+    assert a_bm.l == 0 or a_bm.u == 0
+
+    transposed = a_bm.transposed
+    lower = (a_bm.u == 0)
+    # whether a_bm.data represents a lower or upper triangular matrix
+    data_lower = (lower != transposed)
+    x = _solve_triangular_banded(a_bm.data, b,
+                                 transposed=transposed,
+                                 lower=data_lower)
+    return x
+
+def cho_solve(chol_bm, b):
+    """Solves a matrix equation given the Cholesky decomposition of the matrix.
+
+    Solves A . x = b for x, where A is a positive definite banded matrix, x and
+    b are vectors, and . indicates matrix multiplication.
+    `chol_bm` is a Cholesky factor of A (either upper or lower).
+    """
+    assert chol_bm.l == 0 or chol_bm.u == 0
+
+    lower = (chol_bm.u == 0)
+    chol_lower_bm = chol_bm if lower else chol_bm.T
+
+    x = solve_triangular(
+        chol_lower_bm.T,
+        solve_triangular(chol_lower_bm, b)
+    )
+    return x
+
+def solve(a_bm, b):
+    """Solves a banded matrix equation.
+
+    Solves A . x = b for x, where A is a square banded matrix, x and b are
+    vectors, and . indicates matrix multiplication.
+    """
+    assert a_bm.size == len(b)
+
+    # below is necessary since sla.solve_banded does not have a transpose flag,
+    #   and so is not capable of working with the transposed matrix directly.
+    #   (In fact (surprisingly!) the underlying LAPACK function dgbsv does not
+    #   have a transpose flag either, though gbsv calls dgbtrf and dgbtrs, and
+    #   dgbtrs does have a transpose flag, so LAPACK is capable of working with
+    #   the transposed matrix directly in principle).
+    if a_bm.transposed:
+        a_bm = a_bm.equiv(transposed_new=False)
+
+    if a_bm.size == 0:
+        x = np.zeros_like(b)
+    elif a_bm.size == 1:
+        # workaround for https://github.com/scipy/scipy/issues/8906
+        x = b / a_bm.data[a_bm.u, 0]
+    else:
+        x = sla.solve_banded((a_bm.l, a_bm.u), a_bm.data, b)
+    return x
+
+def solveh(a_bm, b):
+    """Solves a positive definite matrix equation.
+
+    Solves A . x = b for x, where A is a positive definite banded matrix, x and
+    b are vectors, and . indicates matrix multiplication.
+
+    `a_bm` (representing A above) should represent the whole matrix, not just
+    the lower or upper triangles.
+    If the matrix represented by `a_bm` is not symmetric then the behavior of
+    this function is undefined (currently either the upper or lower triangle is
+    used and the rest of the matrix is ignored).
+    """
+    chol_bm = cholesky(a_bm, lower=True)
+    x = cho_solve(chol_bm, b)
+    return x
+
+@cython.boundscheck(False)
+def band_of_inverse_from_chol(chol_bm):
+    """Computes the band of the inverse of a positive definite banded matrix.
+
+    Computes the band of the inverse of a positive definite banded matrix given
+    its Cholesky factor.
+    Equivalently, finds the band of the covariance matrix of a discrete time
+    process which is linear-Gaussian backwards in time.
+
+    `chol_bm` can be either the lower or upper Cholesky factor.
+    """
+    assert chol_bm.l == 0 or chol_bm.u == 0
+
+    if chol_bm.u != 0:
+        chol_bm = chol_bm.T
+
+    cdef unsigned long depth
+    cdef long frames
+    cdef long l_chol, u_chol, transposed_chol
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] chol_data
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] cov_data
+
+    l_chol = chol_bm.l
+    u_chol = chol_bm.u
+    chol_data = chol_bm.data
+    transposed_chol = chol_bm.transposed
+    assert l_chol >= 0
+    assert u_chol == 0
+    assert chol_data.shape[0] == l_chol + u_chol + 1
+
+    depth = l_chol
+    frames = chol_data.shape[1]
+    cov_data = np.zeros((depth * 2 + 1, frames))
+
+    cdef long frame_l
+    cdef unsigned long frame
+    cdef long curr_depth
+    cdef long k_1, k_2
+    cdef unsigned long row_chol
+    cdef long d_chol
+    cdef double mult
+
+    curr_depth = 0
+    for frame_l in range(frames - 1, -1, -1):
+        frame = frame_l
+        row_chol = l_chol if transposed_chol else u_chol
+        mult = 1.0 / chol_data[row_chol, frame]
+        cov_data[depth, frame] = mult * mult
+        for k_2 in range(curr_depth, -1, -1):
+            for k_1 in range(curr_depth, 0, -1):
+                row_chol = l_chol - k_1 if transposed_chol else u_chol + k_1
+                d_chol = k_1 if transposed_chol else 0
+                cov_data[depth + k_2, frame] -= (
+                    chol_data[row_chol, frame + d_chol] *
+                    cov_data[depth + k_1 - k_2, frame + k_2] *
+                    mult
+                )
+        for k_2 in range(curr_depth, 0, -1):
+            cov_data[depth - k_2, frame + k_2] = (
+                cov_data[depth + k_2, frame]
+            )
+        if curr_depth < depth:
+            curr_depth += 1
+
+    cov_bm = BandMat(depth, depth, cov_data)
+    return cov_bm
+
+def band_of_inverse(mat_bm):
+    """Computes band of the inverse of a positive definite banded matrix."""
+    depth = mat_bm.l
+    assert mat_bm.u == depth
+    chol_bm = cholesky(mat_bm, lower=True)
+    band_of_inv_bm = band_of_inverse_from_chol(chol_bm)
+    return band_of_inv_bm
diff --git a/nnmnkwii/paramgen/_bandmat/misc.pyx b/nnmnkwii/paramgen/_bandmat/misc.pyx
new file mode 100644
index 00000000..b1bcd335
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/misc.pyx
@@ -0,0 +1,100 @@
+# cython: language_level=3
+
+"""Assorted helpful functions."""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+cimport numpy as cnp
+cimport cython
+
+cnp.import_array()
+cnp.import_ufunc()
+
+def fancy_plus_equals(cnp.ndarray[cnp.int_t, ndim=1] target_index_seq,
+                      cnp.ndarray[cnp.float64_t, ndim=1] source,
+                      cnp.ndarray[cnp.float64_t, ndim=1] target):
+    """Implements a += method with fancy indexing.
+
+    Does what you might expect
+        target[target_index_seq] += source
+    to do.
+    """
+    cdef unsigned long source_size
+
+    source_size = source.shape[0]
+    assert target_index_seq.shape[0] == source_size
+
+    cdef unsigned long source_index
+    cdef long target_index
+
+    for source_index in range(source_size):
+        target_index = target_index_seq[source_index]
+        target[target_index] += source[source_index]
+
+    return
+
+def fancy_plus_equals_2d(cnp.ndarray[cnp.int_t, ndim=1] target_index_seq,
+                         cnp.ndarray[cnp.float64_t, ndim=2] source,
+                         cnp.ndarray[cnp.float64_t, ndim=2] target):
+    """Implements a += method with fancy indexing.
+
+    Does what you might expect
+        target[target_index_seq] += source
+    to do.
+    """
+    cdef unsigned long source_size
+    cdef unsigned long size1
+
+    source_size = source.shape[0]
+    assert target_index_seq.shape[0] == source_size
+    size1 = source.shape[1]
+    assert target.shape[1] == size1
+
+    cdef unsigned long source_index
+    cdef long target_index
+    cdef unsigned long index1
+
+    for source_index in range(source_size):
+        target_index = target_index_seq[source_index]
+        for index1 in range(size1):
+            target[target_index, index1] += source[source_index, index1]
+
+    return
+
+def fancy_plus_equals_3d(cnp.ndarray[cnp.int_t, ndim=1] target_index_seq,
+                         cnp.ndarray[cnp.float64_t, ndim=3] source,
+                         cnp.ndarray[cnp.float64_t, ndim=3] target):
+    """Implements a += method with fancy indexing.
+
+    Does what you might expect
+        target[target_index_seq] += source
+    to do.
+    """
+    cdef unsigned long source_size
+    cdef unsigned long size1
+    cdef unsigned long size2
+
+    source_size = source.shape[0]
+    assert target_index_seq.shape[0] == source_size
+    size1 = source.shape[1]
+    assert target.shape[1] == size1
+    size2 = source.shape[2]
+    assert target.shape[2] == size2
+
+    cdef unsigned long source_index
+    cdef long target_index
+    cdef unsigned long index1
+    cdef unsigned long index2
+
+    for source_index in range(source_size):
+        target_index = target_index_seq[source_index]
+        for index1 in range(size1):
+            for index2 in range(size2):
+                target[target_index, index1, index2] += (
+                    source[source_index, index1, index2]
+                )
+
+    return
diff --git a/nnmnkwii/paramgen/_bandmat/overlap.pyx b/nnmnkwii/paramgen/_bandmat/overlap.pyx
new file mode 100644
index 00000000..0b030e37
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/overlap.pyx
@@ -0,0 +1,344 @@
+# cython: language_level=3
+
+"""Functions to do with overlapping subtensors."""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+from nnmnkwii.paramgen import _bandmat as bm
+
+import numpy as np
+
+cimport numpy as cnp
+cimport cython
+
+cnp.import_array()
+cnp.import_ufunc()
+
+@cython.boundscheck(False)
+def sum_overlapping_v(cnp.ndarray[cnp.float64_t, ndim=2] contribs,
+                      unsigned long step=1,
+                      cnp.ndarray[cnp.float64_t, ndim=1] target=None):
+    """Computes the overlapped sum of a sequence of vectors.
+
+    The overlapped sum of a sequence of vectors is defined as follows.
+    Suppose the vectors in `contribs` are "laid out" along some larger vector
+    such that each element of `contribs` is offset by `step` indices relative
+    to the previous element.
+    For example `contribs[0]` occupies the left edge of the larger vector,
+    `contribs[1]` starts at index `step` and `contribs[2]` starts at index
+    `step * 2`.
+    The overlapped sum is the sum of these vectors laid out in this way, which
+    is a vector.
+
+    If `target` is None then a new vector is returned; otherwise the overlapped
+    sum is added to the vector `target`.
+    If `contribs` has shape `(num_contribs, width)` then the returned vector
+    (or `target` if specified) has size `num_contribs * step + width - step`.
+    The value of `width` here must be at least `step`.
+    """
+    assert contribs.shape[1] >= step
+
+    cdef unsigned long width = contribs.shape[1]
+    cdef unsigned long num_contribs = contribs.shape[0]
+    cdef unsigned long vec_size = num_contribs * step + width - step
+    cdef cnp.ndarray[cnp.float64_t, ndim=1] vec
+
+    if target is None:
+        vec = np.zeros((vec_size,), dtype=contribs.dtype)
+    else:
+        assert target.shape[0] == vec_size
+        vec = target
+
+    cdef unsigned long index, frame, k
+
+    for index in range(num_contribs):
+        frame = index * step
+        for k in range(width):
+            vec[frame + k] += contribs[index, k]
+
+    if target is None:
+        return vec
+    else:
+        return
+
+@cython.boundscheck(False)
+def sum_overlapping_m(cnp.ndarray[cnp.float64_t, ndim=3] contribs,
+                      unsigned long step=1,
+                      target_bm=None):
+    """Computes the overlapped sum of a sequence of square matrices.
+
+    The overlapped sum of a sequence of matrices is defined as follows.
+    Suppose the matrices in `contribs` are "laid out" along the diagonal of
+    some larger matrix such that each element of `contribs` is `step` indices
+    further down and `step` indices further right than the previous element.
+    For example `contribs[0]` occupies the top left corner of the larger matrix
+    and `contribs[1]` is `step` indices down and right of this.
+    The overlapped sum is the sum of these matrices laid out in this way, which
+    is a banded matrix.
+    For this function the contributions are square, so the resulting banded
+    matrix is also square.
+
+    If `target_bm` is None then a new BandMat is returned; otherwise the
+    overlapped sum is added to the BandMat `target_bm`.
+    If `contribs` has shape `(num_contribs, width, width)` then the returned
+    BandMat (or `target_bm` if specified) has upper and lower bandwidth
+    `width - 1` and size `num_contribs * step + width - step`.
+    The value of `width` here must be at least `step`.
+    """
+    assert contribs.shape[1] >= 1 and contribs.shape[1] >= step
+    assert contribs.shape[2] == contribs.shape[1]
+
+    cdef unsigned long width = contribs.shape[1]
+    cdef unsigned long num_contribs = contribs.shape[0]
+    cdef unsigned long depth = contribs.shape[1] - 1
+    cdef unsigned long mat_size = num_contribs * step + width - step
+
+    if target_bm is None:
+        mat_bm = bm.zeros(depth, depth, mat_size)
+    else:
+        assert target_bm.l == depth and target_bm.u == depth
+        assert target_bm.size == mat_size
+        mat_bm = target_bm
+
+    cdef unsigned long transposed = mat_bm.transposed
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] mat_data = mat_bm.data
+
+    cdef unsigned long index, frame, k, l
+
+    if transposed:
+        for index in range(num_contribs):
+            frame = index * step
+            for k in range(width):
+                for l in range(width):
+                    mat_data[depth + l - k, frame + k] += contribs[index, k, l]
+    else:
+        for index in range(num_contribs):
+            frame = index * step
+            for k in range(width):
+                for l in range(width):
+                    mat_data[depth + k - l, frame + l] += contribs[index, k, l]
+
+    if target_bm is None:
+        return mat_bm
+    else:
+        return
+
+@cython.boundscheck(False)
+def extract_overlapping_v(cnp.ndarray[cnp.float64_t, ndim=1] vec,
+                          unsigned long width,
+                          unsigned long step=1,
+                          cnp.ndarray[cnp.float64_t, ndim=2] target=None):
+    """Extracts overlapping subvectors from a vector.
+
+    The result `subvectors` is a matrix consisting of a sequence of subvectors
+    of `vec`.
+    Specifically `subvectors[i]` is `vec[(i * step):(i * step + width)]`.
+
+    If `target` is None then a new matrix is returned; otherwise the result is
+    written to the matrix `target` (and all elements of `target` are guaranteed
+    to be overwritten, so there is no need to zero it ahead of time).
+    The length of `vec` should be `num_subs * step + width - step` for some
+    `num_subs`, i.e. it should "fit" a whole number of subvectors.
+    The returned matrix has shape `(num_subs, width)`.
+    The value of `width` here must be at least `step`.
+    """
+    assert step >= 1
+    assert width >= step
+    assert vec.shape[0] >= width - step
+    assert (vec.shape[0] + step - width) % step == 0
+
+    cdef unsigned long num_subs = (vec.shape[0] + step - width) // step
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] subvectors
+
+    if target is None:
+        subvectors = np.empty((num_subs, width), dtype=vec.dtype)
+    else:
+        assert target.shape[0] == num_subs
+        assert target.shape[1] == width
+        subvectors = target
+
+    cdef unsigned long index, frame, k
+
+    for index in range(num_subs):
+        frame = index * step
+        for k in range(width):
+            subvectors[index, k] = vec[frame + k]
+
+    if target is None:
+        return subvectors
+    else:
+        return
+
+@cython.boundscheck(False)
+def extract_overlapping_m(mat_bm,
+                          unsigned long step=1,
+                          cnp.ndarray[cnp.float64_t, ndim=3] target=None):
+    """Extracts overlapping submatrices along the diagonal of a banded matrix.
+
+    The result `submats` is rank-3 tensor consisting of a sequence of
+    submatrices from along the diagonal of the matrix represented by `mat_bm`.
+    The upper and lower bandwidth of `mat_bm` should be the same.
+    Let `width` be `mat_bm.l + 1`.
+    If `mat_full` is the matrix represented by `mat_bm` then `submats[i]` is
+    `mat_full[(i * step):(i * step + width), (i * step):(i * step + width)]`.
+
+    If `target` is None then a new tensor is returned; otherwise the result is
+    written to the tensor `target` (and all elements of `target` are guaranteed
+    to be overwritten, so there is no need to zero it ahead of time).
+    The size of `mat_bm` should be `num_subs * step + width - step` for some
+    `num_subs`, i.e. it should "fit" a whole number of submatrices.
+    The returned matrix has shape `(num_subs, width, width)`.
+    The value of `width` here must be at least `step`.
+    """
+    assert mat_bm.l == mat_bm.u
+
+    cdef unsigned long width = mat_bm.l + 1
+
+    assert step >= 1
+    assert width >= step
+    assert mat_bm.size >= width - step
+    assert (mat_bm.size + step - width) % step == 0
+
+    cdef unsigned long num_subs = (mat_bm.size + step - width) // step
+    cdef unsigned long depth = mat_bm.l
+    cdef unsigned long transposed = mat_bm.transposed
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] mat_data = mat_bm.data
+    cdef cnp.ndarray[cnp.float64_t, ndim=3] submats
+
+    if target is None:
+        submats = np.empty((num_subs, width, width), dtype=mat_data.dtype)
+    else:
+        assert target.shape[0] == num_subs
+        assert target.shape[1] == width
+        assert target.shape[2] == width
+        submats = target
+
+    cdef unsigned long index, frame, k, l
+
+    if transposed:
+        for index in range(num_subs):
+            frame = index * step
+            for k in range(width):
+                for l in range(width):
+                    submats[index, k, l] = mat_data[depth + l - k, frame + k]
+    else:
+        for index in range(num_subs):
+            frame = index * step
+            for k in range(width):
+                for l in range(width):
+                    submats[index, k, l] = mat_data[depth + k - l, frame + l]
+
+    if target is None:
+        return submats
+    else:
+        return
+
+def sum_overlapping_v_chunked(contribs_chunks, width, target, step=1):
+    """A chunked version of sum_overlapping_v.
+
+    The elements of the iterator `contribs_chunks` should be of the form
+    `(start, end, contribs)`.
+    For example if `contribs_all` has length 10 then
+
+        sum_overlapping_v_chunked([
+            (0, 3, contribs_all[0:3]),
+            (3, 10, contribs_all[3:10])
+        ])
+
+    produces the same result as `sum_overlapping_v(contribs_all)`.
+    This can be used to construct more memory-efficient code in some cases
+    (though not in the above example).
+    """
+    assert step >= 0
+    overlap = width - step
+    assert overlap >= 0
+
+    for start, end, contribs in contribs_chunks:
+        sum_overlapping_v(
+            contribs,
+            step=step,
+            target=target[(start * step):(end * step + overlap)]
+        )
+
+def sum_overlapping_m_chunked(contribs_chunks, target_bm, step=1):
+    """A chunked version of sum_overlapping_m.
+
+    The elements of the iterator `contribs_chunks` should be of the form
+    `(start, end, contribs)`.
+    For example if `contribs_all` has length 10 then
+
+        sum_overlapping_m_chunked([
+            (0, 3, contribs_all[0:3]),
+            (3, 10, contribs_all[3:10])
+        ])
+
+    produces the same result as `sum_overlapping_m(contribs_all)`.
+    This can be used to construct more memory-efficient code in some cases
+    (though not in the above example).
+    """
+    assert step >= 0
+    depth = target_bm.l
+    assert target_bm.u == depth
+    width = depth + 1
+    overlap = width - step
+    assert overlap >= 0
+
+    for start, end, contribs in contribs_chunks:
+        sum_overlapping_m(
+            contribs,
+            step=step,
+            target_bm=target_bm.sub_matrix_view(
+                start * step, end * step + overlap
+            )
+        )
+
+def extract_overlapping_v_chunked(vec, width, chunk_size, step=1):
+    """A chunked version of extract_overlapping_v.
+
+    An iterator over chunks of the output of extract_overlapping_v is returned.
+    This can be used to construct more memory-efficient code in some cases.
+    """
+    assert step >= 1
+    overlap = width - step
+    assert overlap >= 0
+    num_subs = (len(vec) - overlap) // step
+    assert num_subs * step + overlap == len(vec)
+    assert num_subs >= 0
+    assert chunk_size >= 1
+
+    for start in range(0, num_subs, chunk_size):
+        end = min(start + chunk_size, num_subs)
+        subvectors = extract_overlapping_v(
+            vec[(start * step):(end * step + overlap)],
+            width,
+            step=step
+        )
+        yield start, end, subvectors
+
+def extract_overlapping_m_chunked(mat_bm, chunk_size, step=1):
+    """A chunked version of extract_overlapping_m.
+
+    An iterator over chunks of the output of extract_overlapping_m is returned.
+    This can be used to construct more memory-efficient code in some cases.
+    """
+    assert step >= 1
+    depth = mat_bm.l
+    assert mat_bm.u == depth
+    width = depth + 1
+    overlap = width - step
+    assert overlap >= 0
+    num_subs = (mat_bm.size - overlap) // step
+    assert num_subs * step + overlap == mat_bm.size
+    assert num_subs >= 0
+    assert chunk_size >= 1
+
+    for start in range(0, num_subs, chunk_size):
+        end = min(start + chunk_size, num_subs)
+        submats = extract_overlapping_m(
+            mat_bm.sub_matrix_view(start * step, end * step + overlap),
+            step=step
+        )
+        yield start, end, submats
diff --git a/nnmnkwii/paramgen/_bandmat/tensor.pyx b/nnmnkwii/paramgen/_bandmat/tensor.pyx
new file mode 100644
index 00000000..118f7058
--- /dev/null
+++ b/nnmnkwii/paramgen/_bandmat/tensor.pyx
@@ -0,0 +1,308 @@
+# cython: language_level=3
+
+"""Multiplication, etc using banded matrices."""
+
+# Copyright 2013, 2014, 2015, 2016, 2017, 2018 Matt Shannon
+
+# This file is part of bandmat.
+# See `License` for details of license and warranty.
+
+from nnmnkwii.paramgen._bandmat import core as bm_core
+
+import numpy as np
+
+cimport numpy as cnp
+cimport cython
+
+cnp.import_array()
+cnp.import_ufunc()
+
+@cython.boundscheck(False)
+def dot_mv_plus_equals(a_bm,
+                       cnp.ndarray[cnp.float64_t, ndim=1] b,
+                       cnp.ndarray[cnp.float64_t, ndim=1] target):
+    """Multiplies a banded matrix by a vector, adding the result to a vector.
+
+    The statement `dot_mv_plus_equals(a_bm, b, target)` where `a_bm` is a
+    BandMat is the equivalent of:
+
+        target += np.dot(a_full, b)
+
+    where `a_full` is a square numpy array.
+    """
+    # (FIXME : could wrap corresponding BLAS routine (gbmv) instead)
+    cdef long frames
+    cdef long l_a, u_a, transposed
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] a_data
+
+    l_a = a_bm.l
+    u_a = a_bm.u
+    a_data = a_bm.data
+    transposed = a_bm.transposed
+    assert l_a >= 0
+    assert u_a >= 0
+    assert a_data.shape[0] == l_a + u_a + 1
+
+    frames = a_data.shape[1]
+    assert b.shape[0] == frames
+    assert target.shape[0] == frames
+
+    cdef long o_a
+    cdef unsigned long row_a
+    cdef long d_a
+    cdef unsigned long frame
+
+    for o_a in range(-u_a, l_a + 1):
+        row_a = (l_a - o_a) if transposed else (u_a + o_a)
+        d_a = 0 if transposed else -o_a
+        for frame in range(max(0, o_a), max(0, frames + min(0, o_a))):
+            target[frame] += (
+                a_data[row_a, frame + d_a] *
+                b[frame - o_a]
+            )
+
+    return
+
+def dot_mv(a_bm, b):
+    """Multiplies a banded matrix by a vector.
+
+    The expression `dot_mv(a_bm, b)` where `a_bm` is a BandMat is the
+    equivalent of:
+
+        np.dot(a_full, b)
+
+    where `a_full` is a square numpy array.
+    """
+    size = len(b)
+    assert a_bm.size == size
+    c = np.zeros((size,))
+    dot_mv_plus_equals(a_bm, b, c)
+    return c
+
+@cython.boundscheck(False)
+def dot_mm_plus_equals(a_bm, b_bm, target_bm,
+                       cnp.ndarray[cnp.float64_t, ndim=1] diag=None):
+    """Multiplies two banded matrices, adding the result to a banded matrix.
+
+    If `diag` is None, computes A . B, where . indicates matrix multiplication,
+    and adds the result to `target_bm`.
+    If `diag` is not None, computes A . D . B, where D is the diagonal matrix
+    with diagonal `diag`, and adds result to `target_bm`.
+
+    If `target_bm` does not contain enough rows to contain the result of A . B,
+    then only diagonals of A . B which contribute to the rows present in the
+    banded representation used by `target_bm` are computed.
+    This is more efficient in the case that not all diagonals of `target_bm`
+    are needed.
+
+    The statement `dot_mm_plus_equals(a_bm, b_bm, target_bm, diag=None)` where
+    `a_bm`, `b_bm` and `target_bm` are BandMats is the equivalent of:
+
+        target_full += band_ec(l, u, np.dot(a_full, b_full))
+
+    where `a_full`, `b_full` and `target_full` are square numpy arrays.
+    Here `l` is `target_bm.l` and `u` is `target_bm.u`.
+    If `diag` is not None then it is the equivalent of:
+
+        target_full += (
+            band_ec(l, u, np.dot(np.dot(a_full, np.diag(diag)), b_full))
+        )
+    """
+    cdef long frames
+    cdef long l_a, u_a, transposed_a
+    cdef long l_b, u_b, transposed_b
+    cdef long l_c, u_c, transposed_c
+    cdef long use_diag
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] a_data
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] b_data
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] c_data
+
+    l_a = a_bm.l
+    u_a = a_bm.u
+    a_data = a_bm.data
+    transposed_a = a_bm.transposed
+    assert l_a >= 0
+    assert u_a >= 0
+    assert a_data.shape[0] == l_a + u_a + 1
+
+    l_b = b_bm.l
+    u_b = b_bm.u
+    b_data = b_bm.data
+    transposed_b = b_bm.transposed
+    assert l_b >= 0
+    assert u_b >= 0
+    assert b_data.shape[0] == l_b + u_b + 1
+
+    l_c = target_bm.l
+    u_c = target_bm.u
+    c_data = target_bm.data
+    transposed_c = target_bm.transposed
+    assert l_c >= 0
+    assert u_c >= 0
+    assert c_data.shape[0] == l_c + u_c + 1
+
+    frames = a_data.shape[1]
+    assert b_data.shape[1] == frames
+    assert c_data.shape[1] == frames
+
+    use_diag = (diag is not None)
+    if use_diag:
+        assert diag.shape[0] == frames
+
+    cdef long o_a, o_b, o_c
+    cdef unsigned long row_a, row_b, row_c
+    cdef long d_a, d_b, d_c
+    cdef unsigned long frame
+
+    for o_c in range(-min(u_c, u_a + u_b), min(l_c, l_a + l_b) + 1):
+        for o_a in range(-min(u_a, l_b - o_c), min(l_a, u_b + o_c) + 1):
+            o_b = o_c - o_a
+            row_a = (l_a - o_a) if transposed_a else (u_a + o_a)
+            row_b = (l_b - o_b) if transposed_b else (u_b + o_b)
+            row_c = (l_c - o_c) if transposed_c else (u_c + o_c)
+            d_a = o_a if transposed_a else 0
+            d_b = 0 if transposed_b else -o_b
+            d_c = o_a if transposed_c else -o_b
+            for frame in range(max(0, -o_a, o_b),
+                               max(0, frames + min(0, -o_a, o_b))):
+                c_data[row_c, frame + d_c] += (
+                    a_data[row_a, frame + d_a] *
+                    b_data[row_b, frame + d_b] *
+                    (diag[frame] if use_diag else 1.0)
+                )
+
+    return
+
+def dot_mm(a_bm, b_bm, diag=None):
+    """Multiplies two banded matrices.
+
+    If `diag` is None, computes A . B, where . indicates matrix multiplication.
+    If `diag` is not None, computes A . D . B, where D is the diagonal matrix
+    with diagonal `diag`.
+
+    The expression `dot_mm(a_bm, b_bm, diag=None)` where `a_bm` and `b_bm` are
+    BandMats is the equivalent of:
+
+        np.dot(a_full, b_full)
+
+    where `a_full` and `b_full` are square numpy arrays.
+    If `diag` is not None then it is the equivalent of:
+
+        np.dot(np.dot(a_full, np.diag(diag)), b_full)
+    """
+    assert a_bm.size == b_bm.size
+    c_bm = bm_core.zeros(a_bm.l + b_bm.l, a_bm.u + b_bm.u, a_bm.size)
+    dot_mm_plus_equals(a_bm, b_bm, c_bm, diag=diag)
+    return c_bm
+
+def dot_mm_partial(l, u, a_bm, b_bm, diag=None):
+    """Computes part of the result of multiplying two banded matrices.
+
+    If `diag` is None, computes part of C = A . B, where . indicates matrix
+    multiplication.
+    If `diag` is not None, computes part of C = A . D . B, where D is the
+    diagonal matrix with diagonal `diag`.
+
+    This function only computes the diagonals of C that are within the band
+    specified by `l` and `u`.
+    This is desirable in the case that not all diagonals of C are needed.
+
+    The expression `dot_mm_partial(l, u, a_bm, b_bm, diag=None)` where `a_bm`
+    and `b_bm` are BandMats is the equivalent of:
+
+        band_ec(l, u, np.dot(a_full, b_full))
+
+    where `a_full` and `b_full` are square numpy arrays.
+    If `diag` is not None then it is the equivalent of:
+
+        band_ec(l, u, np.dot(np.dot(a_full, np.diag(diag)), b_full))
+    """
+    assert a_bm.size == b_bm.size
+    c_bm = bm_core.zeros(l, u, a_bm.size)
+    dot_mm_plus_equals(a_bm, b_bm, c_bm, diag=diag)
+    return c_bm
+
+def dot_mmm_partial(l, u, a_bm, b_bm, c_bm):
+    """Computes part of the result of multiplying three banded matrices.
+
+    Computes D = A . (B . C), where . indicates matrix multiplication.
+
+    This function only computes the diagonals of D that are within the band
+    specified by `l` and `u`.
+    Furthermore the intermediate result E = B . C is also only partially
+    computed, with rows that can no have influence on the final result never
+    being computed.
+    This behavior is desirable in the case that not all diagonals of D are
+    needed.
+
+    The expression `dot_mmm_partial(l, u, a_bm, b_bm, c_bm)` where `a_bm`,
+    `b_bm` and `c_bm` are BandMats is the equivalent of:
+
+        band_ec(l, u, np.dot(a_full, np.dot(b_full, c_full)))
+
+    where `a_full`, `b_full` and `c_full` are square numpy arrays.
+    """
+    l_i = l + a_bm.u
+    u_i = u + a_bm.l
+    return dot_mm_partial(l, u, a_bm, dot_mm_partial(l_i, u_i, b_bm, c_bm))
+
+@cython.boundscheck(False)
+def band_of_outer_plus_equals(cnp.ndarray[cnp.float64_t, ndim=1] a_vec,
+                              cnp.ndarray[cnp.float64_t, ndim=1] b_vec,
+                              target_bm,
+                              double mult=1.0):
+    """Adds the band of the outer product of two vectors to a banded matrix.
+
+    The statement `band_of_outer_plus_equals(a_vec, b_vec, target_bm, mult)`
+    where `target_bm` is a BandMat is the equivalent of:
+
+        target_full += band_ec(l, u, np.outer(a_vec, b_vec) * mult)
+
+    where `target_full` is a square numpy array.
+    Here `l` is `target_bm.l` and `u` is `target_bm.u`.
+    """
+    cdef long frames
+    cdef long l_c, u_c, transposed_c
+    cdef cnp.ndarray[cnp.float64_t, ndim=2] c_data
+
+    l_c = target_bm.l
+    u_c = target_bm.u
+    c_data = target_bm.data
+    transposed_c = target_bm.transposed
+    assert l_c >= 0
+    assert u_c >= 0
+    assert c_data.shape[0] == l_c + u_c + 1
+
+    frames = c_data.shape[1]
+    assert a_vec.shape[0] == frames
+    assert b_vec.shape[0] == frames
+
+    cdef long o_c
+    cdef unsigned long row_c
+    cdef long d_c
+    cdef unsigned long frame
+
+    for o_c in range(-u_c, l_c + 1):
+        row_c = (l_c - o_c) if transposed_c else (u_c + o_c)
+        d_c = o_c if transposed_c else 0
+        for frame in range(max(0, -o_c), max(0, frames + min(0, -o_c))):
+            c_data[row_c, frame + d_c] += (
+                a_vec[frame + o_c] * b_vec[frame] * mult
+            )
+
+    return
+
+def trace_dot(a_bm, b_bm):
+    """Convenience method to compute the matrix inner product.
+
+    The expression `trace_dot(a_bm, b_bm)` where `a_bm` and `b_bm` are BandMats
+    is the equivalent of:
+
+        np.trace(np.dot(a_full.T, b_full))
+
+    where `a_full` and `b_full` are square numpy arrays.
+
+    This operation is a valid inner product over the vector space of square
+    matrices of a given size.
+    """
+    return np.sum(bm_core.diag(dot_mm_partial(0, 0, a_bm.T, b_bm)))
diff --git a/nnmnkwii/paramgen/_mlpg.py b/nnmnkwii/paramgen/_mlpg.py
index 3f861106..95811a52 100644
--- a/nnmnkwii/paramgen/_mlpg.py
+++ b/nnmnkwii/paramgen/_mlpg.py
@@ -3,8 +3,8 @@
 
 import numpy as np
 
-import bandmat as bm
-import bandmat.linalg as bla
+from nnmnkwii.paramgen import _bandmat as bm
+from nnmnkwii.paramgen._bandmat import linalg as bla
 from scipy.linalg import solve_banded
 from nnmnkwii.util.linalg import cholesky_inv_banded
 from .mlpg_helper import full_window_mat as _full_window_mat
diff --git a/nnmnkwii/paramgen/mlpg_helper.pyx b/nnmnkwii/paramgen/mlpg_helper.pyx
index e41c6873..f841c199 100644
--- a/nnmnkwii/paramgen/mlpg_helper.pyx
+++ b/nnmnkwii/paramgen/mlpg_helper.pyx
@@ -1,6 +1,7 @@
 # coding: utf-8
 # cython: wraparound = False
 # cython: boundscheck = False
+# cython: language_level=3
 
 import numpy as np
 cimport numpy as np
diff --git a/nnmnkwii/preprocessing/generic.py b/nnmnkwii/preprocessing/generic.py
index d002c1ff..d4b00df9 100644
--- a/nnmnkwii/preprocessing/generic.py
+++ b/nnmnkwii/preprocessing/generic.py
@@ -1,20 +1,20 @@
 # coding: utf-8
-from __future__ import division, print_function, absolute_import
+from __future__ import absolute_import, division, print_function
 
 import numpy as np
-from sklearn.utils.extmath import _incremental_mean_and_var
 from scipy import signal
+from sklearn.utils.extmath import _incremental_mean_and_var
 
 
 # from: https://github.com/scikit-learn/scikit-learn/blob/0.22.2/sklearn/preprocessing/_data.py
 def _handle_zeros_in_scale(scale, copy=True):
-    ''' Makes sure that whenever scale is zero, we handle it correctly.
-    This happens in most scalers when we have constant features.'''
+    """Makes sure that whenever scale is zero, we handle it correctly.
+    This happens in most scalers when we have constant features."""
 
     # if we are fitting on 1D arrays, scale might be a scalar
     if np.isscalar(scale):
-        if scale == .0:
-            scale = 1.
+        if scale == 0.0:
+            scale = 1.0
         return scale
     elif isinstance(scale, np.ndarray):
         if copy:
@@ -46,7 +46,7 @@ def _asint(x):
     # ugly wrapper to support torch/numpy arrays
     isnumpy = isinstance(x, np.ndarray)
     isscalar = np.isscalar(x)
-    return x.astype(np.int) if isnumpy else int(x) if isscalar else x.long()
+    return x.astype(int) if isnumpy else int(x) if isscalar else x.long()
 
 
 def _asfloat(x):
@@ -104,7 +104,7 @@ def inv_mulaw(y, mu=256):
         :func:`nnmnkwii.preprocessing.mulaw_quantize`
         :func:`nnmnkwii.preprocessing.inv_mulaw_quantize`
     """
-    return _sign(y) * (1.0 / mu) * ((1.0 + mu)**_abs(y) - 1.0)
+    return _sign(y) * (1.0 / mu) * ((1.0 + mu) ** _abs(y) - 1.0)
 
 
 def mulaw_quantize(x, mu=256):
@@ -197,8 +197,8 @@ def preemphasis(x, coef=0.97):
         >>> y = P.preemphasis(x, coef=0.97)
         >>> assert x.shape == y.shape
     """
-    b = np.array([1., -coef], x.dtype)
-    a = np.array([1.], x.dtype)
+    b = np.array([1.0, -coef], x.dtype)
+    a = np.array([1.0], x.dtype)
     return signal.lfilter(b, a, x)
 
 
@@ -224,8 +224,8 @@ def inv_preemphasis(x, coef=0.97):
         >>> x_hat = P.inv_preemphasis(P.preemphasis(x, coef=0.97), coef=0.97)
         >>> assert np.allclose(x, x_hat)
     """
-    b = np.array([1.], x.dtype)
-    a = np.array([1., -coef], x.dtype)
+    b = np.array([1.0], x.dtype)
+    a = np.array([1.0, -coef], x.dtype)
     return signal.lfilter(b, a, x)
 
 
@@ -281,13 +281,17 @@ def delta_features(x, windows):
     T, D = x.shape
     assert len(windows) > 0
     combined_features = np.empty((T, D * len(windows)), dtype=x.dtype)
-    for idx, (_, _, window) in enumerate(windows):
-        combined_features[:, D * idx:D * idx +
-                          D] = _apply_delta_window(x, window)
+    # NOTE: bandmat case
+    if isinstance(windows[0], tuple):
+        for idx, (_, _, window) in enumerate(windows):
+            combined_features[:, D * idx : D * idx + D] = _apply_delta_window(x, window)
+    else:
+        for idx, window in enumerate(windows):
+            combined_features[:, D * idx : D * idx + D] = _apply_delta_window(x, window)
     return combined_features
 
 
-def trim_zeros_frames(x, eps=1e-7, trim='b'):
+def trim_zeros_frames(x, eps=1e-7, trim="b"):
     """Remove leading and/or trailing zeros frames.
 
     Similar to :func:`numpy.trim_zeros`, trimming trailing zeros features.
@@ -307,28 +311,28 @@ def trim_zeros_frames(x, eps=1e-7, trim='b'):
         >>> y = trim_zeros_frames(x)
     """
 
-    assert trim in {'f', 'b', 'fb'}
+    assert trim in {"f", "b", "fb"}
 
     T, D = x.shape
     s = np.sum(np.abs(x), axis=1)
-    s[s < eps] = 0.
+    s[s < eps] = 0.0
 
-    if trim == 'f':
-        return x[len(x) - len(np.trim_zeros(s, trim=trim)):]
-    elif trim == 'b':
+    if trim == "f":
+        return x[len(x) - len(np.trim_zeros(s, trim=trim)) :]
+    elif trim == "b":
         end = len(np.trim_zeros(s, trim=trim)) - len(x)
         if end == 0:
             return x
         else:
-            return x[: end]
-    elif trim == 'fb':
-        f = len(np.trim_zeros(s, trim='f'))
-        b = len(np.trim_zeros(s, trim='b'))
+            return x[:end]
+    elif trim == "fb":
+        f = len(np.trim_zeros(s, trim="f"))
+        b = len(np.trim_zeros(s, trim="b"))
         end = b - len(x)
         if end == 0:
-            return x[len(x) - f:]
+            return x[len(x) - f :]
         else:
-            return x[len(x) - f: end]
+            return x[len(x) - f : end]
 
 
 def remove_zeros_frames(x, eps=1e-7):
@@ -351,7 +355,7 @@ def remove_zeros_frames(x, eps=1e-7):
     """
     T, D = x.shape
     s = np.sum(np.abs(x), axis=1)
-    s[s < eps] = 0.
+    s[s < eps] = 0.0
     return x[s > eps]
 
 
@@ -413,8 +417,7 @@ def adjust_frame_length(x, pad=True, divisible_by=1, **kwargs):
     return x
 
 
-def adjust_frame_lengths(x, y, pad=True, ensure_even=False, divisible_by=1,
-                         **kwargs):
+def adjust_frame_lengths(x, y, pad=True, ensure_even=False, divisible_by=1, **kwargs):
     """Adjust frame lengths given two feature vectors or matrices.
 
     This ensures that two feature vectors or matrices have same number of
@@ -493,8 +496,14 @@ def adjust_frame_lengths(x, y, pad=True, ensure_even=False, divisible_by=1,
     return x, y
 
 
-def meanvar(dataset, lengths=None, mean_=0., var_=0.,
-            last_sample_count=0, return_last_sample_count=False):
+def meanvar(
+    dataset,
+    lengths=None,
+    mean_=0.0,
+    var_=0.0,
+    last_sample_count=0,
+    return_last_sample_count=False,
+):
     """Mean/variance computation given a iterable dataset
 
     Dataset can have variable length samples. In that cases, you need to
@@ -532,9 +541,8 @@ def meanvar(dataset, lengths=None, mean_=0., var_=0.,
 
     for idx, x in enumerate(dataset):
         if lengths is not None:
-            x = x[:lengths[idx]]
-        mean_, var_, _ = _incremental_mean_and_var(
-            x, mean_, var_, last_sample_count)
+            x = x[: lengths[idx]]
+        mean_, var_, _ = _incremental_mean_and_var(x, mean_, var_, last_sample_count)
         last_sample_count += len(x)
     mean_, var_ = mean_.astype(dtype), var_.astype(dtype)
 
@@ -544,8 +552,14 @@ def meanvar(dataset, lengths=None, mean_=0., var_=0.,
         return mean_, var_
 
 
-def meanstd(dataset, lengths=None, mean_=0., var_=0.,
-            last_sample_count=0, return_last_sample_count=False):
+def meanstd(
+    dataset,
+    lengths=None,
+    mean_=0.0,
+    var_=0.0,
+    last_sample_count=0,
+    return_last_sample_count=False,
+):
     """Mean/std-deviation computation given a iterable dataset
 
     Dataset can have variable length samples. In that cases, you need to
@@ -579,8 +593,9 @@ def meanstd(dataset, lengths=None, mean_=0., var_=0.,
         >>> lengths = [len(y) for y in Y]
         >>> data_mean, data_std = meanstd(Y, lengths)
     """
-    ret = meanvar(dataset, lengths, mean_, var_,
-                  last_sample_count, return_last_sample_count)
+    ret = meanvar(
+        dataset, lengths, mean_, var_, last_sample_count, return_last_sample_count
+    )
     m, v = ret[0], ret[1]
     v = _handle_zeros_in_scale(np.sqrt(v))
     if return_last_sample_count:
@@ -617,7 +632,7 @@ def minmax(dataset, lengths=None):
 
     for idx, x in enumerate(dataset):
         if lengths is not None:
-            x = x[:lengths[idx]]
+            x = x[: lengths[idx]]
         min_ = np.minimum(min_, np.min(x, axis=(0,)))
         max_ = np.maximum(max_, np.max(x, axis=(0,)))
 
@@ -674,8 +689,9 @@ def inv_scale(x, data_mean, data_std):
 
 def __minmax_scale_factor(data_min, data_max, feature_range):
     data_range = data_max - data_min
-    scale = (feature_range[1] - feature_range[0]) / \
-        _handle_zeros_in_scale(data_range, copy=False)
+    scale = (feature_range[1] - feature_range[0]) / _handle_zeros_in_scale(
+        data_range, copy=False
+    )
     return scale
 
 
@@ -718,8 +734,9 @@ def minmax_scale_params(data_min, data_max, feature_range=(0, 1)):
     return min_, scale_
 
 
-def minmax_scale(x, data_min=None, data_max=None, feature_range=(0, 1),
-                 scale_=None, min_=None):
+def minmax_scale(
+    x, data_min=None, data_max=None, feature_range=(0, 1), scale_=None, min_=None
+):
     """Min/max scaling for given a single data.
 
     Given data min, max and feature range, apply min/max normalization to data.
@@ -761,8 +778,10 @@ def minmax_scale(x, data_min=None, data_max=None, feature_range=(0, 1),
         >>> scaled_x = minmax_scale(X[0], data_min, data_max)
     """
     if (scale_ is None or min_ is None) and (data_min is None or data_max is None):
-        raise ValueError("""
-`data_min` and `data_max` or `scale_` and `min_` must be specified to perform minmax scale""")
+        raise ValueError(
+            """
+`data_min` and `data_max` or `scale_` and `min_` must be specified to perform minmax scale"""
+        )
     if scale_ is None:
         scale_ = __minmax_scale_factor(data_min, data_max, feature_range)
     if min_ is None:
@@ -770,8 +789,9 @@ def minmax_scale(x, data_min=None, data_max=None, feature_range=(0, 1),
     return x * scale_ + min_
 
 
-def inv_minmax_scale(x, data_min=None, data_max=None, feature_range=(0, 1),
-                     scale_=None, min_=None):
+def inv_minmax_scale(
+    x, data_min=None, data_max=None, feature_range=(0, 1), scale_=None, min_=None
+):
     """Inverse transform of min/max scaling for given a single data.
 
     Given data min, max and feature range, apply min/max denormalization to data.
@@ -800,8 +820,10 @@ def inv_minmax_scale(x, data_min=None, data_max=None, feature_range=(0, 1),
         :func:`nnmnkwii.preprocessing.minmax_scale_params`
     """
     if (scale_ is None or min_ is None) and (data_min is None or data_max is None):
-        raise ValueError("""
-`data_min` and `data_max` or `scale_` and `min_` must be specified to perform inverse of minmax scale""")
+        raise ValueError(
+            """
+`data_min` and `data_max` or `scale_` and `min_` must be specified to perform inverse of minmax scale"""
+        )
     if scale_ is None:
         scale_ = __minmax_scale_factor(data_min, data_max, feature_range)
     if min_ is None:
diff --git a/nnmnkwii/util/files.py b/nnmnkwii/util/files.py
index 58a54ab2..f6995bd0 100644
--- a/nnmnkwii/util/files.py
+++ b/nnmnkwii/util/files.py
@@ -1,14 +1,10 @@
-# coding: utf-8
-from __future__ import division, print_function, absolute_import
+from glob import glob
+from os.path import join
 
+import numpy as np
 import pkg_resources
-
 from nnmnkwii.datasets import FileDataSource
 
-import numpy as np
-from glob import glob
-from os.path import join
-
 
 def example_label_file(phone_level=False):
     """Get path of example HTS-style full-context lable file.
@@ -71,7 +67,7 @@ def example_question_file():
     Examples:
         >>> from nnmnkwii.util import example_question_file
         >>> from nnmnkwii.io import hts
-        >>> binary_dict, continuous_dict = hts.load_question_set(example_question_file())
+        >>> binary_dict, numeric_dict = hts.load_question_set(example_question_file())
     """
     return pkg_resources.resource_filename(
         __name__, '_example_data/questions-radio_dnn_416.hed')
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..a78cca3c
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,8 @@
+[build-system]
+requires = [
+    "wheel",
+    "setuptools",
+    "cython>=0.21.0",
+    "numpy>=1.8.0",
+    "scipy",
+]
diff --git a/setup.py b/setup.py
index 99494abf..3a0bfafc 100644
--- a/setup.py
+++ b/setup.py
@@ -12,17 +12,20 @@
 import os
 import sys
 
-version = '0.0.23'
+version = "0.0.23"
 
 # Adapted from https://github.com/pytorch/pytorch
 cwd = os.path.dirname(os.path.abspath(__file__))
-if os.getenv('NNMNKWII_BUILD_VERSION'):
-    version = os.getenv('NNMNKWII_BUILD_VERSION')
+if os.getenv("NNMNKWII_BUILD_VERSION"):
+    version = os.getenv("NNMNKWII_BUILD_VERSION")
 else:
     try:
-        sha = subprocess.check_output(
-            ['git', 'rev-parse', 'HEAD'], cwd=cwd).decode('ascii').strip()
-        version += '+' + sha[:7]
+        sha = (
+            subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=cwd)
+            .decode("ascii")
+            .strip()
+        )
+        version += "+" + sha[:7]
     except subprocess.CalledProcessError:
         pass
     except IOError:  # FileNotFoundError for python 3
@@ -36,11 +39,11 @@ def finalize_options(self):
         # Prevent numpy from thinking it is still in its setup process:
         __builtins__.__NUMPY_SETUP__ = False
         import numpy
+
         self.include_dirs.append(numpy.get_include())
 
 
 class build_py(setuptools.command.build_py.build_py):
-
     def run(self):
         self.create_version_file()
         setuptools.command.build_py.build_py.run(self)
@@ -48,14 +51,13 @@ def run(self):
     @staticmethod
     def create_version_file():
         global version, cwd
-        print('-- Building version ' + version)
-        version_path = os.path.join(cwd, 'nnmnkwii', 'version.py')
-        with open(version_path, 'w') as f:
+        print("-- Building version " + version)
+        version_path = os.path.join(cwd, "nnmnkwii", "version.py")
+        with open(version_path, "w") as f:
             f.write("__version__ = '{}'\n".format(version))
 
 
 class develop(setuptools.command.develop.develop):
-
     def run(self):
         build_py.create_version_file()
         setuptools.command.develop.develop.run(self)
@@ -63,9 +65,10 @@ def run(self):
 
 cmdclass = {"build_py": build_py, "develop": develop}
 
-min_cython_ver = '0.28.0'
+min_cython_ver = "0.28.0"
 try:
     import Cython
+
     ver = Cython.__version__
     _CYTHON_INSTALLED = ver >= LooseVersion(min_cython_ver)
 except ImportError:
@@ -73,20 +76,22 @@ def run(self):
 
 try:
     if not _CYTHON_INSTALLED:
-        raise ImportError('No supported version of Cython installed.')
+        raise ImportError("No supported version of Cython installed.")
     from Cython.Distutils import build_ext
+
     cython = True
 except ImportError:
     cython = False
 
 include_dirs = []
-cmdclass['build_ext'] = build_ext
+cmdclass["build_ext"] = build_ext
 if cython:
-    ext = '.pyx'
+    ext = ".pyx"
     import numpy as np
+
     include_dirs.insert(0, np.get_include())
 else:
-    ext = '.c'
+    ext = ".c"
     print("Building extentions from pre-generated C source")
     if not os.path.exists(join("nnmnkwii", "paramgen", "mlpg_helper" + ext)):
         raise RuntimeError("Cython is required to generate C code.")
@@ -104,56 +109,74 @@ def run(self):
         sources=[join("nnmnkwii", "paramgen", "mlpg_helper" + ext)],
         include_dirs=include_dirs,
         language="c",
-        extra_compile_args=["-std=c99"]
+        extra_compile_args=["-std=c99"],
     ),
 ]
 
+### [start] bandmat related code ###
+# https://github.com/MattShannon/bandmat/issues/10
+bandmat_upstream_fixed = False
+if not bandmat_upstream_fixed:
+    cython_locs = [
+        ("nnmnkwii", "paramgen", "_bandmat", "full"),
+        ("nnmnkwii", "paramgen", "_bandmat", "core"),
+        ("nnmnkwii", "paramgen", "_bandmat", "tensor"),
+        ("nnmnkwii", "paramgen", "_bandmat", "linalg"),
+        ("nnmnkwii", "paramgen", "_bandmat", "misc"),
+        ("nnmnkwii", "paramgen", "_bandmat", "overlap"),
+    ]
+    ext_modules.extend(
+        Extension(
+            ".".join(loc),
+            [join(*loc) + ".pyx"],
+            extra_compile_args=["-O3"],
+            include_dirs=include_dirs,
+        )
+        for loc in cython_locs
+    )
+### [end] bandmat related code ###
+
 
 def package_files(directory):
     # https://stackoverflow.com/questions/27664504/
     paths = []
     for (path, directories, filenames) in os.walk(directory):
         for filename in filenames:
-            paths.append(os.path.join('..', path, filename))
+            paths.append(os.path.join("..", path, filename))
     return paths
 
 
 package_data = package_files("./nnmnkwii/util/_example_data")
 
 install_requires = [
-    'scipy',
-    'cython >= ' + min_cython_ver,
-    'fastdtw',
-    'scikit-learn',
-    'pysptk >= 0.1.17',
-    'tqdm',
+    "scipy",
+    "cython >= " + min_cython_ver,
+    "fastdtw",
+    "scikit-learn",
+    "pysptk >= 0.1.17",
+    "tqdm",
 ]
 
-# Workaround for python >= 3.7 and bandmat
-# https://github.com/r9y9/deepvoice3_pytorch/issues/154
-if sys.version_info[:2] < (3, 7):
-    install_requires.append('bandmat >= 0.7')
-
 setup(
-    name='nnmnkwii',
+    name="nnmnkwii",
     version=version,
-    description='Library to build speech synthesis systems designed for easy and fast prototyping.',
-    long_description=open('README.md', 'rb').read().decode("utf-8"),
-    long_description_content_type='text/markdown',
-    author='Ryuichi Yamamoto',
-    author_email='zryuichi@gmail.com',
-    url='https://github.com/r9y9/nnmnkwii',
-    license='MIT',
+    description="Library to build speech synthesis systems designed for easy and fast prototyping.",
+    long_description=open("README.md", "rb").read().decode("utf-8"),
+    long_description_content_type="text/markdown",
+    author="Ryuichi Yamamoto",
+    author_email="zryuichi@gmail.com",
+    url="https://github.com/r9y9/nnmnkwii",
+    license="MIT",
     packages=find_packages(exclude=["tests", "perf"]),
-    package_data={'': package_data},
+    package_data={"": package_data},
     ext_modules=ext_modules,
     cmdclass=cmdclass,
     setup_requires=["numpy >= 1.11.0"],
     install_requires=install_requires,
-    tests_require=['nose', 'coverage'],
+    tests_require=["nose", "coverage"],
     extras_require={
-        'docs': ['numpydoc', 'sphinx_rtd_theme'],
-        'test': ['nose', 'pyworld', 'librosa'],
+        "docs": ["numpydoc", "sphinx_rtd_theme"],
+        "test": ["nose", "pyworld", "librosa"],
     },
     classifiers=[
         "Operating System :: Microsoft :: Windows",
@@ -173,5 +196,5 @@ def package_files(directory):
         "Intended Audience :: Science/Research",
         "Intended Audience :: Developers",
     ],
-    keywords=["Research"]
+    keywords=["Research"],
 )
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index b40b9668..70bbe11c 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,15 +1,15 @@
-from __future__ import division, print_function, absolute_import
+from __future__ import absolute_import, division, print_function
 
-from nnmnkwii.datasets import FileSourceDataset, PaddedFileSourceDataset
-from nnmnkwii.datasets import MemoryCacheFramewiseDataset
-from nnmnkwii.datasets import FileDataSource
-from nnmnkwii.util import example_file_data_sources_for_acoustic_model
-from nnmnkwii.util import example_file_data_sources_for_duration_model
+from os.path import dirname, join
 
 import numpy as np
-from nose.tools import raises
+from nnmnkwii.datasets import (FileDataSource, FileSourceDataset,
+                               MemoryCacheFramewiseDataset,
+                               PaddedFileSourceDataset)
+from nnmnkwii.util import (example_file_data_sources_for_acoustic_model,
+                           example_file_data_sources_for_duration_model)
 from nose.plugins.attrib import attr
-from os.path import join, dirname
+from nose.tools import raises
 
 DATA_DIR = join(dirname(__file__), "data")
 
@@ -153,7 +153,7 @@ def test_fixed_length_sequence_wise_iteration():
 def test_frame_wise_iteration():
     X, Y = _get_small_datasets(padded=False)
 
-    lengths = np.array([len(x) for x in X], dtype=np.int)
+    lengths = np.array([len(x) for x in X], dtype=int)
     num_utterances = len(lengths)
 
     # With sufficient cache size
@@ -198,7 +198,7 @@ def __len__(self):
     def __test(X, Y, batch_size):
         dataset = TorchDataset(X, Y)
         loader = data_utils.DataLoader(
-            dataset, batch_size=batch_size, num_workers=1, shuffle=True)
+            dataset, batch_size=batch_size, num_workers=0, shuffle=True)
         for idx, (x, y) in enumerate(loader):
             assert len(x.shape) == len(y.shape)
             assert len(x.shape) == 3
@@ -228,7 +228,7 @@ def test_frame_wise_torch_data_loader():
     # fixed size length, i.e., implements `__len__` method, we need to know
     # number of frames for each utterance.
     # Sum of the number of frames is the dataset size for frame-wise iteration.
-    lengths = np.array([len(x) for x in X], dtype=np.int)
+    lengths = np.array([len(x) for x in X], dtype=int)
 
     # For the above reason, we need to explicitly give the number of frames.
     X = MemoryCacheFramewiseDataset(X, lengths, cache_size=len(X))
@@ -248,7 +248,7 @@ def __len__(self):
     def __test(X, Y, batch_size):
         dataset = TorchDataset(X, Y)
         loader = data_utils.DataLoader(
-            dataset, batch_size=batch_size, num_workers=1, shuffle=True)
+            dataset, batch_size=batch_size, num_workers=0, shuffle=True)
         for idx, (x, y) in enumerate(loader):
             assert len(x.shape) == 2
             assert len(y.shape) == 2
diff --git a/tests/test_frontend.py b/tests/test_frontend.py
index 70f44095..229f2514 100644
--- a/tests/test_frontend.py
+++ b/tests/test_frontend.py
@@ -1,25 +1,22 @@
-from __future__ import division, print_function, absolute_import
-
-from nnmnkwii.io import hts
-from nnmnkwii.frontend import merlin as fe
-from nnmnkwii.util import example_question_file, example_label_file
-
 from os.path import dirname, join
-import numpy as np
 
+import numpy as np
+from nnmnkwii.frontend import merlin as fe
+from nnmnkwii.io import hts
+from nnmnkwii.util import example_label_file, example_question_file
 from nose.tools import raises
 
 DATA_DIR = join(dirname(__file__), "data")
 
 
 def test_invalid_linguistic_features():
-    binary_dict, continuous_dict = hts.load_question_set(example_question_file())
+    binary_dict, numeric_dict = hts.load_question_set(example_question_file())
     phone_labels = hts.load(example_label_file(phone_level=True))
     state_labels = hts.load(example_label_file(phone_level=False))
 
     @raises(ValueError)
     def __test(labels, subphone_features, add_frame_features):
-        fe.linguistic_features(labels, binary_dict, continuous_dict,
+        fe.linguistic_features(labels, binary_dict, numeric_dict,
                                subphone_features=subphone_features,
                                add_frame_features=add_frame_features)
 
@@ -40,13 +37,13 @@ def __test(labels, unit_size, feature_size):
 
 def test_silence_frame_removal_given_hts_labels():
     qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
-    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)
+    binary_dict, numeric_dict = hts.load_question_set(qs_file_name)
 
     input_state_label = join(DATA_DIR, "label_state_align", "arctic_a0001.lab")
     labels = hts.load(input_state_label)
     features = fe.linguistic_features(labels,
                                       binary_dict,
-                                      continuous_dict,
+                                      numeric_dict,
                                       add_frame_features=True,
                                       subphone_features="full"
                                       )
@@ -64,7 +61,7 @@ def test_silence_frame_removal_given_hts_labels():
 # Make sure we can get same results with Merlin
 def test_linguistic_and_duration_features_for_duration_model():
     qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
-    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)
+    binary_dict, numeric_dict = hts.load_question_set(qs_file_name)
 
     # Phone-level linguistic features
     # Linguistic features
@@ -73,7 +70,7 @@ def test_linguistic_and_duration_features_for_duration_model():
     assert labels.is_state_alignment_label()
     x = fe.linguistic_features(labels,
                                binary_dict,
-                               continuous_dict,
+                               numeric_dict,
                                add_frame_features=False,
                                subphone_features=None
                                )
@@ -93,7 +90,7 @@ def test_linguistic_and_duration_features_for_duration_model():
 
 def test_linguistic_features_for_acoustic_model():
     qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
-    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)
+    binary_dict, numeric_dict = hts.load_question_set(qs_file_name)
 
     # Linguistic features
     # To train acoustic model paired with linguistic features,
@@ -103,7 +100,7 @@ def test_linguistic_features_for_acoustic_model():
     assert labels.is_state_alignment_label()
     x = fe.linguistic_features(labels,
                                binary_dict,
-                               continuous_dict,
+                               numeric_dict,
                                add_frame_features=True,
                                subphone_features="full"
                                )
@@ -114,18 +111,18 @@ def test_linguistic_features_for_acoustic_model():
 
 def test_phone_alignment_label():
     qs_file_name = join(DATA_DIR, "questions-radio_dnn_416.hed")
-    binary_dict, continuous_dict = hts.load_question_set(qs_file_name)
+    binary_dict, numeric_dict = hts.load_question_set(qs_file_name)
 
     input_state_label = join(DATA_DIR, "label_phone_align", "arctic_a0001.lab")
     labels = hts.load(input_state_label)
-    x = fe.linguistic_features(labels, binary_dict, continuous_dict,
+    x = fe.linguistic_features(labels, binary_dict, numeric_dict,
                                add_frame_features=False,
                                subphone_features=None)
     assert not labels.is_state_alignment_label()
     assert np.all(np.isfinite(x))
 
     for subphone_features in ["coarse_coding", "minimal_phoneme"]:
-        x = fe.linguistic_features(labels, binary_dict, continuous_dict,
+        x = fe.linguistic_features(labels, binary_dict, numeric_dict,
                                    add_frame_features=True,
                                    subphone_features=subphone_features)
         assert np.all(np.isfinite(x))
diff --git a/tests/test_io.py b/tests/test_io.py
index f01aff56..719c7089 100644
--- a/tests/test_io.py
+++ b/tests/test_io.py
@@ -1,35 +1,34 @@
-# coding: utf-8
-from nnmnkwii.io import hts
-from nnmnkwii.frontend import merlin as fe
-from os.path import dirname, join
 import copy
-from nnmnkwii.util import example_question_file
 import re
-from nose.tools import raises
+from os.path import dirname, join
 
+from nnmnkwii.frontend import merlin as fe
+from nnmnkwii.io import hts
+from nnmnkwii.util import example_question_file
+from nose.tools import raises
 
 DATA_DIR = join(dirname(__file__), "data")
 
 
 def test_labels_number_of_frames():
     # https://github.com/r9y9/nnmnkwii/issues/85
-    binary_dict, continuous_dict = hts.load_question_set(
-        join(DATA_DIR, "jp.hed"))
+    binary_dict, numeric_dict = hts.load_question_set(join(DATA_DIR, "jp.hed"))
     labels = hts.load(join(DATA_DIR, "BASIC5000_0619.lab"))
     linguistic_features = fe.linguistic_features(
-        labels, binary_dict, continuous_dict, add_frame_features=True)
+        labels, binary_dict, numeric_dict, add_frame_features=True
+    )
     assert labels.num_frames() == linguistic_features.shape[0]
 
 
 def test_load_question_set():
-    binary_dict, continuous_dict = hts.load_question_set(
-        example_question_file())
-    assert len(binary_dict) + len(continuous_dict) == 416
+    binary_dict, numeric_dict = hts.load_question_set(example_question_file())
+    assert len(binary_dict) + len(numeric_dict) == 416
 
 
 def test_htk_style_question_basics():
-    binary_dict, continuous_dict = hts.load_question_set(
-        join(DATA_DIR, "test_question.hed"))
+    binary_dict, numeric_dict = hts.load_question_set(
+        join(DATA_DIR, "test_question.hed")
+    )
     # sil k o n i ch i w a sil
     input_phone_label = join(DATA_DIR, "hts-nit-atr503", "phrase01.lab")
     labels = hts.load(input_phone_label)
@@ -44,12 +43,12 @@ def test_htk_style_question_basics():
 QS "R-Phone_o"       {*+o=*}
 QS "RR-Phone_o"      {*=o/A:*}
     """
-    LL_muon1 = binary_dict[0][0]
-    LL_muon2 = binary_dict[1][0]
-    L_muon1 = binary_dict[2][0]
-    C_sil = binary_dict[3][0]
-    R_phone_o = binary_dict[4][0]
-    RR_phone_o = binary_dict[5][0]
+    LL_muon1 = binary_dict[0][1][0]
+    LL_muon2 = binary_dict[1][1][0]
+    L_muon1 = binary_dict[2][1][0]
+    C_sil = binary_dict[3][1][0]
+    R_phone_o = binary_dict[4][1][0]
+    RR_phone_o = binary_dict[5][1][0]
 
     # xx^xx-sil+k=o
     label = labels[0][-1]
@@ -88,25 +87,29 @@ def test_singing_voice_question():
 QS "L-Phone_Yuusei_Boin"           {*^a-*,*^i-*,*^u-*,*^e-*,*^o-*}
 CQS "e1" {/E:(\\NOTE)]}
     """
-    binary_dict, continuous_dict = hts.load_question_set(
-        join(DATA_DIR, "test_jp_svs.hed"), append_hat_for_LL=False, convert_svs_pattern=True)
+    binary_dict, numeric_dict = hts.load_question_set(
+        join(DATA_DIR, "test_jp_svs.hed"),
+        append_hat_for_LL=False,
+        convert_svs_pattern=True,
+    )
     input_phone_label = join(DATA_DIR, "song070_f00001_063.lab")
     labels = hts.load(input_phone_label)
-    feats = fe.linguistic_features(labels, binary_dict, continuous_dict)
+    feats = fe.linguistic_features(labels, binary_dict, numeric_dict)
     assert feats.shape == (74, 3)
 
     # CQS e1: get the current MIDI number
-    C_e1 = continuous_dict[0]
+    C_e1 = numeric_dict[0][1]
     for idx, lab in enumerate(labels):
         context = lab[-1]
         if C_e1.search(context) is not None:
             from nnmnkwii.frontend import NOTE_MAPPING
+
             assert NOTE_MAPPING[C_e1.findall(context)[0]] == feats[idx, 1]
 
     # CQS e57: get pitch diff
     # In contrast to other continous features, the pitch diff has a prefix "m" or "p"
     # to indiecate th sign of numbers.
-    C_e57 = continuous_dict[1]
+    C_e57 = numeric_dict[1][1]
     for idx, lab in enumerate(labels):
         context = lab[-1]
         if "~p2+" in context:
@@ -157,8 +160,9 @@ def test_mono():
     sil_regex = re.compile("sil")
 
     for indices in [
-            labels.silence_label_indices(sil_regex),
-            labels.silence_phone_indices(sil_regex)]:
+        labels.silence_label_indices(sil_regex),
+        labels.silence_phone_indices(sil_regex),
+    ]:
         assert len(indices) == 2
         assert indices[0] == 0
         assert indices[1] == len(labels) - 1
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
index 4ac68632..2e052a38 100644
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
@@ -110,7 +110,7 @@ def test_real_metrics():
 
     mgc = X[:, :, :source.mgc_dim // 3]
     lf0 = X[:, :, source.lf0_start_idx]
-    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(np.int)
+    vuv = (X[:, :, source.vuv_start_idx] > 0).astype(int)
     bap = X[:, :, source.bap_start_idx]
 
     mgc_tgt = mgc + 0.01
diff --git a/tests/test_pack_pad_sequence.py b/tests/test_pack_pad_sequence.py
index 08520a65..79a97f6c 100644
--- a/tests/test_pack_pad_sequence.py
+++ b/tests/test_pack_pad_sequence.py
@@ -82,7 +82,7 @@ def test_pack_sequnce():
     """
 
     X, Y = _get_small_datasets(padded=False)
-    lengths = np.array([len(x) for x in X], dtype=np.int)[:, None]
+    lengths = np.array([len(x) for x in X], dtype=int)[:, None]
 
     # We need padded dataset
     X, Y = _get_small_datasets(padded=True)
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 8cc880a9..4e83ac09 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,29 +1,32 @@
-from __future__ import division, print_function, absolute_import
+from __future__ import absolute_import, division, print_function
 
-from nnmnkwii.preprocessing.f0 import interp1d
-from nnmnkwii.preprocessing import trim_zeros_frames, remove_zeros_frames
-from nnmnkwii.preprocessing import adjust_frame_lengths, delta_features
-from nnmnkwii.preprocessing import adjust_frame_length
-from nnmnkwii.preprocessing import modspec, modphase
-from nnmnkwii.preprocessing import preemphasis, inv_preemphasis
+import librosa
+import numpy as np
+import pysptk
+import pyworld
 from nnmnkwii import preprocessing as P
-from nnmnkwii.util import example_file_data_sources_for_duration_model
-from nnmnkwii.util import example_file_data_sources_for_acoustic_model
-from nnmnkwii.util import example_audio_file
 from nnmnkwii.datasets import FileSourceDataset, PaddedFileSourceDataset
-
+from nnmnkwii.preprocessing import (
+    adjust_frame_length,
+    adjust_frame_lengths,
+    delta_features,
+    inv_preemphasis,
+    preemphasis,
+    remove_zeros_frames,
+    trim_zeros_frames,
+)
+from nnmnkwii.preprocessing.f0 import interp1d
+from nnmnkwii.util import (
+    example_audio_file,
+    example_file_data_sources_for_acoustic_model,
+    example_file_data_sources_for_duration_model,
+)
+from nose.plugins.attrib import attr
 from nose.tools import raises
-
 from scipy.io import wavfile
-import numpy as np
-import pyworld
-import librosa
-import pysptk
-
-from nose.plugins.attrib import attr
 
 
-def _get_windows_set():
+def _get_windows_set_bandmat():
     windows_set = [
         # Static
         [
@@ -44,6 +47,27 @@ def _get_windows_set():
     return windows_set
 
 
+def _get_windows_set():
+    windows_set = [
+        # Static
+        [
+            [1.0],
+        ],
+        # Static + delta
+        [
+            [1.0],
+            [-0.5, 0.0, 0.5],
+        ],
+        # Static + delta + deltadelta
+        [
+            [1.0],
+            [-0.5, 0.0, 0.5],
+            [1.0, -2.0, 1.0],
+        ],
+    ]
+    return windows_set
+
+
 def test_mulaw():
     # Check corner cases
     assert P.mulaw_quantize(-1.0, 2) == 0
@@ -84,7 +108,9 @@ def test_mulaw():
 
     # torch array input
     from warnings import warn
+
     import torch
+
     torch.manual_seed(1234)
     for mu in [128, 256, 512]:
         x = torch.rand(10)
@@ -121,24 +147,22 @@ def test_meanvar_incremental():
     assert np.allclose(X_var, X_var_inc)
 
     # Split dataset and compute meanvar incrementaly
-    X_a = X[:N // 2]
-    X_b = X[N // 2:]
-    X_mean_a, X_var_a, last_sample_count = P.meanvar(
-        X_a, return_last_sample_count=True)
-    assert last_sample_count == np.sum(lengths[:N // 2])
+    X_a = X[: N // 2]
+    X_b = X[N // 2 :]
+    X_mean_a, X_var_a, last_sample_count = P.meanvar(X_a, return_last_sample_count=True)
+    assert last_sample_count == np.sum(lengths[: N // 2])
     X_mean_b, X_var_b = P.meanvar(
-        X_b, mean_=X_mean_a, var_=X_var_a,
-        last_sample_count=last_sample_count)
+        X_b, mean_=X_mean_a, var_=X_var_a, last_sample_count=last_sample_count
+    )
     assert np.allclose(X_mean, X_mean_b)
     assert np.allclose(X_var, X_var_b)
 
     # meanstd
-    X_mean_a, X_std_a, last_sample_count = P.meanstd(
-        X_a, return_last_sample_count=True)
-    assert last_sample_count == np.sum(lengths[:N // 2])
+    X_mean_a, X_std_a, last_sample_count = P.meanstd(X_a, return_last_sample_count=True)
+    assert last_sample_count == np.sum(lengths[: N // 2])
     X_mean_b, X_std_b = P.meanstd(
-        X_b, mean_=X_mean_a, var_=X_std_a**2,
-        last_sample_count=last_sample_count)
+        X_b, mean_=X_mean_a, var_=X_std_a ** 2, last_sample_count=last_sample_count
+    )
     assert np.allclose(X_mean, X_mean_b)
     assert np.allclose(X_std, X_std_b)
 
@@ -224,7 +248,8 @@ def __test_raise2(x, X_min, X_max):
     assert np.allclose(x, x_hat)
 
     x_hat = P.inv_minmax_scale(
-        P.minmax_scale(x, scale_=scale_, min_=min_), scale_=scale_, min_=min_)
+        P.minmax_scale(x, scale_=scale_, min_=min_), scale_=scale_, min_=min_
+    )
     assert np.allclose(x, x_hat)
 
 
@@ -279,29 +304,29 @@ def test_trim_zeros_frames():
     np.testing.assert_array_equal(actual_default, desired_default)
 
     desired_b = np.array(((0, 0), (0, 0), (1, 1), (2, 2)))
-    actual_b = trim_zeros_frames(arr, trim='b')
+    actual_b = trim_zeros_frames(arr, trim="b")
 
     assert desired_b.shape[1] == actual_b.shape[1]
     np.testing.assert_array_equal(actual_b, desired_b)
 
     desired_f = np.array(((1, 1), (2, 2), (0, 0)))
-    actual_f = trim_zeros_frames(arr, trim='f')
+    actual_f = trim_zeros_frames(arr, trim="f")
 
     assert desired_f.shape[1] == actual_f.shape[1]
     np.testing.assert_array_equal(actual_f, desired_f)
 
     desired_fb = np.array(((1, 1), (2, 2)))
-    actual_fb = trim_zeros_frames(arr, trim='fb')
+    actual_fb = trim_zeros_frames(arr, trim="fb")
 
     assert desired_fb.shape[1] == actual_fb.shape[1]
     np.testing.assert_array_equal(actual_fb, desired_fb)
 
     non_zeros = np.array(((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)))
     desired_b_or_fb_non_zeros = np.array(((1, 1), (2, 2), (3, 3), (4, 4), (5, 5)))
-    actual_b = trim_zeros_frames(non_zeros, trim='b')
+    actual_b = trim_zeros_frames(non_zeros, trim="b")
     np.testing.assert_array_equal(actual_b, desired_b_or_fb_non_zeros)
 
-    actual_fb = trim_zeros_frames(non_zeros, trim='fb')
+    actual_fb = trim_zeros_frames(non_zeros, trim="fb")
     np.testing.assert_array_equal(actual_fb, desired_b_or_fb_non_zeros)
 
 
@@ -326,8 +351,12 @@ def test_adjust_frame_length_divisible():
     assert (adjust_frame_length(x, pad=True, divisible_by=3)[-1] == 0).all()
 
     # make sure we passes extra kwargs to np.pad
-    assert (adjust_frame_length(x, pad=True, divisible_by=3,
-                                mode="constant", constant_values=1)[-1] == 1).all()
+    assert (
+        adjust_frame_length(
+            x, pad=True, divisible_by=3, mode="constant", constant_values=1
+        )[-1]
+        == 1
+    ).all()
 
     # Should preserve dtype
     for dtype in [np.float32, np.float64]:
@@ -342,8 +371,10 @@ def test_adjust_frame_lengths():
     D = 5
 
     # 1d and 2d padding
-    for (x, y) in [(np.random.rand(T1), np.random.rand(T2)),
-                   (np.random.rand(T1, D), np.random.rand(T2, D))]:
+    for (x, y) in [
+        (np.random.rand(T1), np.random.rand(T2)),
+        (np.random.rand(T1, D), np.random.rand(T2, D)),
+    ]:
         x_hat, y_hat = adjust_frame_lengths(x, y, pad=True)
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 11
@@ -352,24 +383,20 @@ def test_adjust_frame_lengths():
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 10
 
-        x_hat, y_hat = adjust_frame_lengths(x, y, pad=True,
-                                            divisible_by=2)
+        x_hat, y_hat = adjust_frame_lengths(x, y, pad=True, divisible_by=2)
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 12
 
-        x_hat, y_hat = adjust_frame_lengths(x, y, pad=False,
-                                            divisible_by=2)
+        x_hat, y_hat = adjust_frame_lengths(x, y, pad=False, divisible_by=2)
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 10
 
         # Divisible
-        x_hat, y_hat = adjust_frame_lengths(x, y, pad=False,
-                                            divisible_by=3)
+        x_hat, y_hat = adjust_frame_lengths(x, y, pad=False, divisible_by=3)
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 9
 
-        x_hat, y_hat = adjust_frame_lengths(x, y, pad=True,
-                                            divisible_by=3)
+        x_hat, y_hat = adjust_frame_lengths(x, y, pad=True, divisible_by=3)
         assert x_hat.shape == y_hat.shape
         assert x_hat.shape[0] == 12
 
@@ -381,7 +408,8 @@ def test_adjust_frame_lengths():
 
     # make sure we passes extra kwargs to np.pad
     x_hat, y_hat = adjust_frame_lengths(
-        x, y, pad=True, divisible_by=3, mode="constant", constant_values=1)
+        x, y, pad=True, divisible_by=3, mode="constant", constant_values=1
+    )
     assert x_hat[-1] == 1 and y_hat[-1] == 1
 
 
@@ -392,6 +420,9 @@ def test_delta_features():
     for windows in _get_windows_set():
         y = delta_features(x, windows)
         assert y.shape == (T, static_dim * len(windows))
+    for windows in _get_windows_set_bandmat():
+        y = delta_features(x, windows)
+        assert y.shape == (T, static_dim * len(windows))
 
 
 def _get_mcep(x, fs, frame_period=5, order=24):
@@ -413,11 +444,14 @@ def test_dtw_frame_length_adjustment():
     X = FileSourceDataset(X)
     X_unaligned = X.asarray()
     # This should trigger frame length adjustment
-    Y_unaligned = np.pad(X_unaligned, [(0, 0), (5, 0), (0, 0)],
-                         mode="constant", constant_values=0)
+    Y_unaligned = np.pad(
+        X_unaligned, [(0, 0), (5, 0), (0, 0)], mode="constant", constant_values=0
+    )
     Y_unaligned = Y_unaligned[:, :-5, :]
-    for aligner in [DTWAligner(), IterativeDTWAligner(
-            n_iter=1, max_iter_gmm=1, n_components_gmm=1)]:
+    for aligner in [
+        DTWAligner(),
+        IterativeDTWAligner(n_iter=1, max_iter_gmm=1, n_components_gmm=1),
+    ]:
         X_aligned, Y_aligned = aligner.transform((X_unaligned, Y_unaligned))
         assert X_aligned.shape == Y_aligned.shape
 
@@ -426,7 +460,8 @@ def test_dtw_frame_length_adjustment():
 def test_dtw_aligner():
     from nnmnkwii.preprocessing.alignment import DTWAligner, IterativeDTWAligner
 
-    x, fs = librosa.load(example_audio_file(), sr=None)
+    fs, x = wavfile.read(example_audio_file())
+    x = (x / 32768.0).astype(np.float32)
     assert fs == 16000
     x_fast = librosa.effects.time_stretch(x, 2.0)
 
@@ -447,12 +482,14 @@ def test_dtw_aligner():
     assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
 
     X_aligned, Y_aligned = IterativeDTWAligner(
-        n_iter=2, max_iter_gmm=10, n_components_gmm=2).transform((X, Y))
+        n_iter=2, max_iter_gmm=10, n_components_gmm=2
+    ).transform((X, Y))
     assert X_aligned.shape == Y_aligned.shape
     assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
 
     # Custom dist function
     from nnmnkwii.metrics import melcd
+
     X_aligned, Y_aligned = DTWAligner(dist=melcd).transform((X, Y))
     assert np.linalg.norm(X_aligned - Y_aligned) < np.linalg.norm(X - Y)
 
@@ -482,15 +519,15 @@ def test_modspec_smoothing():
         for norm in [None, "ortho"]:
             for n in [1024, 2048]:
                 # Nyquist freq
-                y_hat = P.modspec_smoothing(y, modfs, n=n, norm=norm,
-                                            cutoff=modfs // 2,
-                                            log_domain=log_domain)
+                y_hat = P.modspec_smoothing(
+                    y, modfs, n=n, norm=norm, cutoff=modfs // 2, log_domain=log_domain
+                )
                 assert np.allclose(y, y_hat)
 
                 # Smooth
-                P.modspec_smoothing(y, modfs, n=n, norm=norm,
-                                    cutoff=modfs // 4,
-                                    log_domain=log_domain)
+                P.modspec_smoothing(
+                    y, modfs, n=n, norm=norm, cutoff=modfs // 4, log_domain=log_domain
+                )
 
     # Cutoff frequency larger than modfs//2
     @raises(ValueError)
diff --git a/tests/test_util.py b/tests/test_util.py
index 1b1ce4a6..dd787762 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -55,7 +55,7 @@ def dummmy_func2d(x):
 
 
 def _get_banded_test_mat(win_mats, T):
-    import bandmat as bm
+    from nnmnkwii.paramgen import _bandmat as bm
 
     sdw = max([win_mat.l + win_mat.u for win_mat in win_mats])
     P = bm.zeros(sdw, sdw, T)