diff --git a/.coveragerc b/.coveragerc
index ce73b59b11..cf9ec88a13 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,8 +2,7 @@
[run]
omit =
tests/*
- conda/*
- scripts/tests/*
+ scripts/*
concurrency =
multiprocessing
thread
diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000000..6b82ac0df8
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 100
+max-complexity = 18
+exclude = tests,__init__.py
diff --git a/.github/workflows/unittests-gpu.yml b/.github/workflows/unittests-gpu.yml
new file mode 100644
index 0000000000..c0dc3368d3
--- /dev/null
+++ b/.github/workflows/unittests-gpu.yml
@@ -0,0 +1,60 @@
+name: continuous build - gpu
+
+on: [push, pull_request_target]
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ unittest-gpu:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ - name: Install Linux dependencies
+ run: sudo apt-get install libopenblas-dev
+
+ - name: Setup python
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ architecture: x64
+
+ - name: Install Other Dependencies
+ run: |
+ python -m pip install --user --quiet --upgrade pip
+ python -m pip install --user --quiet -e .[extras]
+
+ - name: Configure AWS Credentials
+ uses: aws-actions/configure-aws-credentials@v1
+ with:
+ aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
+ aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+ aws-region: us-east-1
+
+ - name: Extract branch name
+ shell: bash
+ run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+ id: extract_branch
+
+    - name: Test project on AWS Batch (for push)
+ if: startsWith(steps.extract_branch.outputs.branch, 'PR-') != true
+ run: |
+ python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.ref }} --work-dir tools/batch --remote https://github.com/dmlc/gluon-nlp --command "/batch_states/test.sh" --wait | tee > script.log
+
+    - name: Test project on AWS Batch (for pull request)
+ if: startsWith(steps.extract_branch.outputs.branch, 'PR-') == true
+ run: |
+ python ./tools/batch/submit-job.py --region us-east-1 --job-type g4dn.4x --source-ref ${{ github.event.pull_request.head.ref }} --work-dir tools/batch --remote https://github.com/${{ github.event.pull_request.head.repo.full_name }} --command "/batch_states/test.sh" --wait | tee > script.log
+
+ - name: Upload log file for AWS Batch test results
+ uses: actions/upload-artifact@v2
+ with:
+ name: GPU_Test_Results
+ path: script.log
+
+
diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml
new file mode 100644
index 0000000000..ced8f9a1c8
--- /dev/null
+++ b/.github/workflows/unittests.yml
@@ -0,0 +1,47 @@
+name: continuous build
+
+on: [push, pull_request]
+
+defaults:
+ run:
+ shell: bash
+
+jobs:
+ unittest:
+ runs-on: ${{ matrix.os }}
+ strategy:
+ fail-fast: false
+ matrix:
+ # TODO Add windows test by using "windows-latest"
+ os: [macos-latest, ubuntu-latest]
+ python-version: [ '3.6', '3.7', '3.8']
+ steps:
+ - name: Checkout repository
+ uses: actions/checkout@v2
+
+ # Install OS specific dependencies
+ - name: Install Linux dependencies
+ if: matrix.os == 'ubuntu-latest'
+ # TODO https://github.com/apache/incubator-mxnet/issues/18293
+ run: sudo apt-get install libopenblas-dev
+
+ - name: Setup python
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ architecture: x64
+ - name: Install Other Dependencies
+ run: |
+ python -m pip install --user --upgrade pip
+ python -m pip install --user setuptools pytest pytest-cov contextvars
+ python -m pip install --upgrade cython
+ python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+ python -m pip install --user -e .[extras]
+ - name: Test project
+ run: |
+ python -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/
+ - name: Upload coverage to Codecov
+ uses: codecov/codecov-action@v1.0.10
+ with:
+ env_vars: OS,PYTHON
+
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index 99f7dae7c9..0000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,10 +0,0 @@
-[submodule "scripts/word_embeddings/tools/extern/CLI11"]
- path = scripts/word_embeddings/tools/extern/CLI11
- url = https://github.com/CLIUtils/CLI11.git
-[submodule "scripts/word_embeddings/tools/extern/cnpy"]
- path = scripts/word_embeddings/tools/extern/cnpy
- url = https://github.com/leezu/cnpy
- branch = libzip
-[submodule "scripts/word_embeddings/tools/extern/sparsepp"]
- path = scripts/word_embeddings/tools/extern/sparsepp
- url = https://github.com/greg7mdp/sparsepp.git
diff --git a/.pytype.cfg b/.pytype.cfg
index 8220a41658..ebf2d9c586 100644
--- a/.pytype.cfg
+++ b/.pytype.cfg
@@ -5,4 +5,4 @@ inputs =
src/gluonnlp
# Python version (major.minor) of the target code.
-python_version = 3.5
+python_version = 3.6
diff --git a/CODEOWNERS b/CODEOWNERS
index 11af321c0e..43d8c57893 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,9 +1,9 @@
-# Watchers and contributors to Apache MXNet repo directories/packages/files
+# Watchers and contributors to DMLC GluonNLP repo directories/packages/files
# Please see documentation of use of CODEOWNERS file at
# https://help.github.com/articles/about-codeowners/ and
# https://github.com/blog/2392-introducing-code-owners
#
-# Anybody can add themselves or a team as additional watcher or contributor
+# Anybody can add themselves or a team as additional watcher or contributor
# to get notified about changes in a specific package.
# See https://help.github.com/articles/about-teams how to setup teams.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
deleted file mode 100644
index abb8a2119f..0000000000
--- a/CONTRIBUTING.md
+++ /dev/null
@@ -1 +0,0 @@
-Contribution guideline can be found at http://gluon-nlp.mxnet.io/community/contribute.html
diff --git a/MANIFEST.in b/MANIFEST.in
deleted file mode 100644
index 5ebc05b4eb..0000000000
--- a/MANIFEST.in
+++ /dev/null
@@ -1,5 +0,0 @@
-recursive-include gluonnlp *.py
-include LICENSE
-include README.rst
-recursive-exclude tests *
-recursive-exclude scripts *
\ No newline at end of file
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 90b1b01e19..0000000000
--- a/Makefile
+++ /dev/null
@@ -1,113 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-ROOTDIR = $(CURDIR)
-MD2IPYNB = $(ROOTDIR)/docs/md2ipynb.py
-
-flake8:
- flake8 --exclude conda,*tests*,test_*.py,scripts/word_embeddings/tools/extern --count --select=E9,F63,F7,F82 --show-source --statistics $(lintdir)
-
-pylint:
- pylint --rcfile=$(ROOTDIR)/.pylintrc $(lintdir)
-
-pytype:
- pytype --config=$(ROOTDIR)/.pytype.cfg
-
-restruc:
- python setup.py check --restructuredtext --strict
-
-lint:
- make lintdir=$(lintdir) flake8
- make lintdir=$(lintdir) pylint
- make pytype
- make lintdir=$(lintdir) ratcheck
- make restruc
-
-ci/rat/apache-rat.jar:
- mkdir -p build
- svn co http://svn.apache.org/repos/asf/creadur/rat/tags/apache-rat-project-0.13/ ci/rat/apache-rat; \
- cd ci/rat/apache-rat/apache-rat; \
- mvn -Dmaven.test.skip=true install;
- cp ci/rat/apache-rat/apache-rat/target/apache-rat-0.13.jar ci/rat/apache-rat.jar
-
-ratcheck: ci/rat/apache-rat.jar
- exec 5>&1; \
- RAT_JAR=ci/rat/apache-rat.jar; \
- OUTPUT=$(java -jar $(RAT_JAR) -E ci/rat/rat-excludes -d $(lintdir) | tee >(cat - >&5)); \
- ERROR_MESSAGE="Printing headers for text files without a valid license header"; \
- echo "-------Process The Output-------"; \
- if [[ $OUTPUT =~ $ERROR_MESSAGE ]]; then \
- echo "ERROR: RAT Check detected files with unknown licenses. Please fix and run test again!"; \
- exit 1; \
- else \
- echo "SUCCESS: There are no files with an Unknown License."; \
- fi
-
-docs: compile_notebooks distribute
- make -C docs html SPHINXOPTS=-W
- for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \
- FILE=`echo $$f | sed 's/docs\///g'` ; \
- DIR=`dirname $$FILE` ; \
- BASENAME=`basename $$FILE` ; \
- HTML_BASENAME=`echo $$BASENAME | sed 's/md/html/'` ; \
- IPYNB_BASENAME=`echo $$BASENAME | sed 's/md/ipynb/'` ; \
- TARGET_HTML="docs/_build/html/$$DIR/$$HTML_BASENAME" ; \
- echo "processing" $$BASENAME ; \
- sed -i "s/$$IPYNB_BASENAME/$$BASENAME/g" $$TARGET_HTML; \
- done;
- for f in $(shell find docs/model_zoo -type f -name '*.rst' -print) ; do \
- DIR=`dirname $$f` ; \
- BASENAME=`basename $$f` ; \
- HTML_BASENAME=`echo $$BASENAME | sed 's/rst/html/'` ; \
- TARGET_HTML="docs/_build/html/$$DIR/$$HTML_BASENAME" ; \
- echo "processing" $$BASENAME ; \
- sed -i "s/docs\/model_zoo/scripts/g" $$TARGET_HTML; \
- done;
- sed -i.bak 's/33\,150\,243/23\,141\,201/g' docs/_build/html/_static/material-design-lite-1.3.0/material.blue-deep_orange.min.css;
- sed -i.bak 's/2196f3/178dc9/g' docs/_build/html/_static/sphinx_materialdesign_theme.css;
-
-clean:
- git clean -ff -d -x --exclude="$(ROOTDIR)/tests/data/*" --exclude="$(ROOTDIR)/conda/"
-
-compile_notebooks:
- for f in $(shell find docs/examples -type f -name '*.md' -print) ; do \
- DIR=$$(dirname $$f) ; \
- BASENAME=$$(basename $$f) ; \
- TARGETNAME=$${BASENAME%.md}.ipynb ; \
- echo $$DIR $$BASENAME $$TARGETNAME; \
- cd $$DIR ; \
- if [ -f $$TARGETNAME ]; then \
- echo $$TARGETNAME exists. Skipping compilation of $$BASENAME in Makefile. ; \
- else \
- python $(MD2IPYNB) $$BASENAME ; \
- fi ; \
- cd - ; \
- done;
-
-dist_scripts:
- cd scripts && \
- find * -type d -prune | grep -v 'tests\|__pycache__' | xargs -t -n 1 -I{} zip -r {}.zip {}
-
-dist_notebooks:
- cd docs/examples && \
- find * -type d -prune | grep -v 'tests\|__pycache__' | xargs -t -n 1 -I{} zip -r {}.zip {} -x "*.md" -x "__pycache__" -x "*.pyc" -x "*.txt" -x "*.log" -x "*.params" -x "*.npz" -x "*.json"
-
-test:
- py.test -v --capture=no --durations=0 tests/unittest scripts
-
-distribute: dist_scripts dist_notebooks
- python setup.py sdist
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..62e34d894e
--- /dev/null
+++ b/README.md
@@ -0,0 +1,111 @@
+
+GluonNLP: Your Choice of Deep Learning for NLP
+
+GluonNLP is a toolkit that enables easy text preprocessing, dataset
+loading, and neural model building to help you speed up your Natural
+Language Processing (NLP) research.
+
+# Features
+
+For NLP Practitioners
+- Easy-to-use Data Pipeline
+- Automatically Train Models via AutoNLP (TODO)
+
+For Researchers
+- Pretrained Model Zoo
+- Programming with numpy-like API
+
+For Engineers
+- Fast Deployment
+ - [TVM](https://tvm.apache.org/) (TODO)
+- AWS Integration
+
+
+# Installation
+First of all, install the latest MXNet. You may use the following commands:
+
+```bash
+# Install the version with CUDA 10.0
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the version with CUDA 10.1
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the version with CUDA 10.2
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+
+# Install the cpu-only version
+python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+```
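+
+As a quick sanity check that MXNet installed correctly (any recent 2.0 nightly build
+should work), you can print the installed version:
+
+```bash
+python3 -c "import mxnet; print(mxnet.__version__)"
+```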
+
+
+To install GluonNLP, use
+
+```bash
+python3 -m pip install -U -e .
+
+# Also, you may install all the extra requirements via
+python3 -m pip install -U -e ."[extras]"
+```
+
+If you do not have the required permissions, you can also install into the user folder:
+
+```bash
+python3 -m pip install -U -e . --user
+```
+
+For Windows users, we recommend using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about).
+
+
+# Access the Command-line Toolkits
+
+To help researchers and engineers, we provide command-line toolkits for
+downloading and preprocessing NLP datasets. For more details, you may refer to
+[GluonNLP Datasets](./scripts/datasets) and [GluonNLP Preprocessing Tools](./scripts/preprocess).
+
+```bash
+# CLI for downloading / preparing the dataset
+nlp_data help
+
+# CLI for accessing some common data preprocessing scripts
+nlp_preprocess help
+
+# Also, you can use `python -m` to access the toolkits
+python3 -m gluonnlp.cli.data help
+python3 -m gluonnlp.cli.preprocess help
+
+```
+
+### Frequently Asked Questions
+- **Question**: I cannot access the command-line toolkits. Running `nlp_data` reports `nlp_data: command not found`.
+
+  This usually happens because gluonnlp was installed into the user folder, so the
+  executables live in `~/.local/bin`. You can change the `PATH` variable to
+  also include `~/.local/bin`:
+
+ ```
+ export PATH=${PATH}:~/.local/bin
+ ```
+
+
+# Run Unittests
+You may go to [tests](tests) to learn how to run the unittests; a minimal example is sketched below.
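+
+The following is a minimal sketch of a local CPU test run, mirroring the commands used by the
+continuous-build CI workflow (individual tests may need additional dependencies):
+
+```bash
+# install the core test dependencies that CI uses
+python3 -m pip install --user setuptools pytest pytest-cov contextvars
+
+# run the unit tests on CPU with coverage, as the CI workflow does
+python3 -m pytest --cov=./ --cov-report=xml --device="cpu" --durations=50 tests/
+```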
+
+
+# Use Docker
+You can use Docker to launch a JupyterLab development environment with GluonNLP installed.
+
+```
+docker pull gluonai/gluon-nlp:gpu-latest
+docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --shm-size=4g gluonai/gluon-nlp:gpu-latest
+```
+
+For more details, you can refer to the guidance in [tools/docker](tools/docker).
diff --git a/README.rst b/README.rst
deleted file mode 100644
index cf004dc838..0000000000
--- a/README.rst
+++ /dev/null
@@ -1,218 +0,0 @@
-.. raw:: html
-
- ![](https://github.com/dmlc/gluon-nlp/raw/be3bc8852155e935d68d397e0743715c54c3ce76/docs/_static/gluon_s2.png)
-
-
-.. raw:: html
-
-
-
-GluonNLP: Your Choice of Deep Learning for NLP
-
-.. raw:: html
-
-
-
-.. raw:: html
-
-
-
-
-
-
-GluonNLP is a toolkit that enables easy text preprocessing, datasets
-loading and neural models building to help you speed up your Natural
-Language Processing (NLP) research.
-
-- `Quick Start Guide `__
-- `Resources `__
-
-News
-====
-
-- Tutorial proposal for GluonNLP is accepted at `EMNLP 2019 `__, Hong Kong.
-
-- GluonNLP was featured in:
-
- - **KDD 2019 Alaska**! Check out our tutorial: `From Shallow to Deep Language Representations: Pre-training, Fine-tuning, and Beyond `__.
- - **JSALT 2019 in Montreal, 2019-6-14**! Checkout **https://jsalt19.mxnet.io**.
- - **AWS re:invent 2018 in Las Vegas, 2018-11-28**! Checkout `details `_.
- - **PyData 2018 NYC, 2018-10-18**! Checkout the `awesome talk `__ by Sneha Jha.
- - **KDD 2018 London, 2018-08-21, Apache MXNet Gluon tutorial**! Check out **https://kdd18.mxnet.io**.
-
-Installation
-============
-
-Make sure you have Python 3.5 or newer and a recent version of MXNet (our CI
-server runs the testsuite with Python 3.5).
-
-You can install ``MXNet`` and ``GluonNLP`` using pip.
-
-``GluonNLP`` is based on the most recent version of ``MXNet``.
-
-
-In particular, if you want to install the most recent ``MXNet`` release:
-
-::
-
- pip install --upgrade mxnet>=1.6.0
-
-Else, if you want to install the most recent ``MXNet`` nightly build:
-
-::
-
- pip install --pre --upgrade mxnet
-
-Then, you can install ``GluonNLP``:
-
-::
-
- pip install gluonnlp
-
-Please check more `installation details `_.
-
-Docs 📖
-=======
-
-GluonNLP documentation is available at `our
-website `__.
-
-Community
-=========
-
-GluonNLP is a community that believes in sharing.
-
-For questions, comments, and bug reports, `Github issues `__ is the best way to reach us.
-
-We now have a new Slack channel `here `__.
-(`register `__).
-
-How to Contribute
-=================
-
-GluonNLP community welcomes contributions from anyone!
-
-There are lots of opportunities for you to become our `contributors `__:
-
-- Ask or answer questions on `GitHub issues `__.
-- Propose ideas, or review proposed design ideas on `GitHub issues `__.
-- Improve the `documentation `__.
-- Contribute bug reports `GitHub issues `__.
-- Write new `scripts `__ to reproduce
- state-of-the-art results.
-- Write new `examples `__ to explain
- key ideas in NLP methods and models.
-- Write new `public datasets `__
- (license permitting).
-- Most importantly, if you have an idea of how to contribute, then do it!
-
-For a list of open starter tasks, check `good first issues `__.
-
-Also see our `contributing
-guide `__ on simple how-tos,
-contribution guidelines and more.
-
-Resources
-=========
-
-Check out how to use GluonNLP for your own research or projects.
-
-If you are new to Gluon, please check out our `60-minute crash course
-`__.
-
-For getting started quickly, refer to notebook runnable examples at
-`Examples. `__
-
-For advanced examples, check out our
-`Scripts. `__
-
-For experienced users, check out our
-`API Notes `__.
-
-Quick Start Guide
-=================
-
-`Dataset Loading `__
--------------------------------------------------------------------------------
-
-Load the Wikitext-2 dataset, for example:
-
-.. code:: python
-
- >>> import gluonnlp as nlp
- >>> train = nlp.data.WikiText2(segment='train')
- >>> train[0:5]
- ['=', 'Valkyria', 'Chronicles', 'III', '=']
-
-`Vocabulary Construction `__
--------------------------------------------------------------------------------------
-
-Build vocabulary based on the above dataset, for example:
-
-.. code:: python
-
- >>> vocab = nlp.Vocab(counter=nlp.data.Counter(train))
- >>> vocab
- Vocab(size=33280, unk="", reserved="['', '', '']")
-
-`Neural Models Building `__
-------------------------------------------------------------------------------------
-
-From the models package, apply a Standard RNN language model to the
-above dataset:
-
-.. code:: python
-
- >>> model = nlp.model.language_model.StandardRNN('lstm', len(vocab),
- ... 200, 200, 2, 0.5, True)
- >>> model
- StandardRNN(
- (embedding): HybridSequential(
- (0): Embedding(33280 -> 200, float32)
- (1): Dropout(p = 0.5, axes=())
- )
- (encoder): LSTM(200 -> 200.0, TNC, num_layers=2, dropout=0.5)
- (decoder): HybridSequential(
- (0): Dense(200 -> 33280, linear)
- )
- )
-
-`Word Embeddings Loading `__
------------------------------------------------------------------------------------------
-
-For example, load a GloVe word embedding, one of the state-of-the-art
-English word embeddings:
-
-.. code:: python
-
- >>> glove = nlp.embedding.create('glove', source='glove.6B.50d')
- # Obtain vectors for 'baby' in the GloVe word embedding
- >>> type(glove['baby'])
-
- >>> glove['baby'].shape
- (50,)
-
-
-Reference Paper
-===============
-
-The bibtex entry for the `reference paper `__ of GluonNLP is:
-
-.. code::
-
- @article{gluoncvnlp2020,
- author = {Jian Guo and He He and Tong He and Leonard Lausen and Mu Li and Haibin Lin and Xingjian Shi and Chenguang Wang and Junyuan Xie and Sheng Zha and Aston Zhang and Hang Zhang and Zhi Zhang and Zhongyue Zhang and Shuai Zheng and Yi Zhu},
- title = {GluonCV and GluonNLP: Deep Learning in Computer Vision and Natural Language Processing},
- journal = {Journal of Machine Learning Research},
- year = {2020},
- volume = {21},
- number = {23},
- pages = {1-7},
- url = {http://jmlr.org/papers/v21/19-429.html}
- }
-
-
-New to Deep Learning or NLP?
-============================
-
-For background knowledge of deep learning or NLP, please refer to the open source book `Dive into Deep Learning `__.
diff --git a/ci/batch/docker/Dockerfile b/ci/batch/docker/Dockerfile
deleted file mode 100644
index 8cc64125b5..0000000000
--- a/ci/batch/docker/Dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
-
- RUN apt-get update && apt-get install -y --no-install-recommends \
- build-essential \
- locales \
- cmake \
- git \
- curl \
- vim \
- unzip \
- sudo \
- ca-certificates \
- libjpeg-dev \
- libpng-dev \
- libfreetype6-dev \
- libxft-dev &&\
- rm -rf /var/lib/apt/lists/*
-
- RUN curl -o ~/miniconda.sh -O https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh && \
- chmod +x ~/miniconda.sh && \
- ~/miniconda.sh -b -p /opt/conda && \
- rm ~/miniconda.sh && \
- /opt/conda/bin/conda clean -ya
- ENV PATH /opt/conda/bin:$PATH
- RUN git clone https://github.com/dmlc/gluon-nlp
- WORKDIR gluon-nlp
- ADD gluon_nlp_job.sh .
diff --git a/ci/codecov.sh b/ci/codecov.sh
deleted file mode 100755
index 1ef332b1b3..0000000000
--- a/ci/codecov.sh
+++ /dev/null
@@ -1,1550 +0,0 @@
-#!/usr/bin/env bash
-
-# Apache License Version 2.0, January 2004
-# https://github.com/codecov/codecov-bash/blob/master/LICENSE
-
-
-set -e +o pipefail
-
-VERSION="0b37652"
-
-url="https://codecov.io"
-env="$CODECOV_ENV"
-service=""
-token=""
-search_in=""
-flags=""
-exit_with=0
-curlargs=""
-curlawsargs=""
-dump="0"
-clean="0"
-curl_s="-s"
-name="$CODECOV_NAME"
-include_cov=""
-exclude_cov=""
-ddp="$(echo ~)/Library/Developer/Xcode/DerivedData"
-xp=""
-files=""
-cacert="$CODECOV_CA_BUNDLE"
-gcov_ignore="-not -path './bower_components/**' -not -path './node_modules/**' -not -path './vendor/**'"
-gcov_include=""
-
-ft_gcov="1"
-ft_coveragepy="1"
-ft_fix="1"
-ft_search="1"
-ft_s3="1"
-ft_network="1"
-ft_xcodellvm="1"
-ft_xcodeplist="0"
-
-_git_root=$(git rev-parse --show-toplevel 2>/dev/null || hg root 2>/dev/null || echo $PWD)
-git_root="$_git_root"
-codecov_yml=""
-remote_addr=""
-if [ "$git_root" = "$PWD" ];
-then
- git_root="."
-fi
-
-url_o=""
-pr_o=""
-build_o=""
-commit_o=""
-search_in_o=""
-tag_o=""
-branch_o=""
-slug_o=""
-prefix_o=""
-
-commit="$VCS_COMMIT_ID"
-branch="$VCS_BRANCH_NAME"
-pr="$VCS_PULL_REQUEST"
-slug="$VCS_SLUG"
-tag="$VCS_TAG"
-build_url="$CI_BUILD_URL"
-build="$CI_BUILD_ID"
-job="$CI_JOB_ID"
-
-beta_xcode_partials=""
-
-proj_root="$git_root"
-gcov_exe="gcov"
-gcov_arg=""
-
-b="\033[0;36m"
-g="\033[0;32m"
-r="\033[0;31m"
-e="\033[0;90m"
-x="\033[0m"
-
-show_help() {
-cat << EOF
-
- Codecov Bash $VERSION
-
- Global report uploading tool for Codecov
- Documentation at https://docs.codecov.io/docs
- Contribute at https://github.com/codecov/codecov-bash
-
-
- -h Display this help and exit
- -f FILE Target file(s) to upload
-
- -f "path/to/file" only upload this file
- skips searching unless provided patterns below
-
- -f '!*.bar' ignore all files at pattern *.bar
- -f '*.foo' include all files at pattern *.foo
- Must use single quotes.
- This is non-exclusive, use -s "*.foo" to match specific paths.
-
- -s DIR Directory to search for coverage reports.
- Already searches project root and artifact folders.
- -t TOKEN Set the private repository token
- (option) set environment variable CODECOV_TOKEN=:uuid
-
- -t @/path/to/token_file
- -t uuid
-
- -n NAME Custom defined name of the upload. Visible in Codecov UI
-
- -e ENV Specify environment variables to be included with this build
- Also accepting environment variables: CODECOV_ENV=VAR,VAR2
-
- -e VAR,VAR2
-
- -X feature Toggle functionalities
-
- -X gcov Disable gcov
- -X coveragepy Disable python coverage
- -X fix Disable report fixing
- -X search Disable searching for reports
- -X xcode Disable xcode processing
- -X network Disable uploading the file network
-
- -R root dir Used when not in git/hg project to identify project root directory
- -y conf file Used to specify the location of the .codecov.yml config file
- -F flag Flag the upload to group coverage metrics
-
- -F unittests This upload is only unittests
- -F integration This upload is only integration tests
- -F ui,chrome This upload is Chrome - UI tests
-
- -c Move discovered coverage reports to the trash
- -Z Exit with 1 if not successful. Default will Exit with 0
-
- -- xcode --
- -D Custom Derived Data Path for Coverage.profdata and gcov processing
- Default '~/Library/Developer/Xcode/DerivedData'
- -J Specify packages to build coverage.
- This can significantly reduces time to build coverage reports.
-
- -J 'MyAppName' Will match "MyAppName" and "MyAppNameTests"
- -J '^ExampleApp$' Will match only "ExampleApp" not "ExampleAppTests"
-
- -- gcov --
- -g GLOB Paths to ignore during gcov gathering
- -G GLOB Paths to include during gcov gathering
- -p dir Project root directory
- Also used when preparing gcov
- -k prefix Prefix filepaths to help resolve path fixing: https://github.com/codecov/support/issues/472
- -x gcovexe gcov executable to run. Defaults to 'gcov'
- -a gcovargs extra arguments to pass to gcov
-
- -- Override CI Environment Variables --
- These variables are automatically detected by popular CI providers
-
- -B branch Specify the branch name
- -C sha Specify the commit sha
- -P pr Specify the pull request number
- -b build Specify the build number
- -T tag Specify the git tag
-
- -- Enterprise --
- -u URL Set the target url for Enterprise customers
- Not required when retrieving the bash uploader from your CCE
- (option) Set environment variable CODECOV_URL=https://my-hosted-codecov.com
- -r SLUG owner/repo slug used instead of the private repo token in Enterprise
- (option) set environment variable CODECOV_SLUG=:owner/:repo
- (option) set in your codecov.yml "codecov.slug"
- -S PATH File path to your cacert.pem file used to verify ssl with Codecov Enterprise (optional)
- (option) Set environment variable: CODECOV_CA_BUNDLE="/path/to/ca.pem"
- -U curlargs Extra curl arguments to communicate with Codecov. e.g., -U "--proxy http://http-proxy"
- -A curlargs Extra curl arguments to communicate with AWS.
-
- -- Debugging --
- -d Don't upload, but dump upload file to stdout
- -K Remove color from the output
- -v Verbose mode
-
-EOF
-}
-
-
-say() {
- echo -e "$1"
-}
-
-
-urlencode() {
- echo "$1" | curl -Gso /dev/null -w %{url_effective} --data-urlencode @- "" | cut -c 3- | sed -e 's/%0A//'
-}
-
-
-swiftcov() {
- _dir=$(dirname "$1" | sed 's/\(Build\).*/\1/g')
- for _type in app framework xctest
- do
- find "$_dir" -name "*.$_type" | while read f
- do
- _proj=${f##*/}
- _proj=${_proj%."$_type"}
- if [ "$2" = "" ] || [ "$(echo "$_proj" | grep -i "$2")" != "" ];
- then
- say " $g+$x Building reports for $_proj $_type"
- dest=$([ -f "$f/$_proj" ] && echo "$f/$_proj" || echo "$f/Contents/MacOS/$_proj")
- _proj_name=$(echo "$_proj" | sed -e 's/[[:space:]]//g')
- xcrun llvm-cov show $beta_xcode_partials -instr-profile "$1" "$dest" > "$_proj_name.$_type.coverage.txt" \
- || say " ${r}x>${x} llvm-cov failed to produce results for $dest"
- fi
- done
- done
-}
-
-
-# Credits to: https://gist.github.com/pkuczynski/8665367
-parse_yaml() {
- local prefix=$2
- local s='[[:space:]]*' w='[a-zA-Z0-9_]*' fs=$(echo @|tr @ '\034')
- sed -ne "s|^\($s\)\($w\)$s:$s\"\(.*\)\"$s\$|\1$fs\2$fs\3|p" \
- -e "s|^\($s\)\($w\)$s:$s\(.*\)$s\$|\1$fs\2$fs\3|p" $1 |
- awk -F$fs '{
- indent = length($1)/2;
- vname[indent] = $2;
- for (i in vname) {if (i > indent) {delete vname[i]}}
- if (length($3) > 0) {
- vn=""; if (indent > 0) {vn=(vn)(vname[0])("_")}
- printf("%s%s%s=\"%s\"\n", "'$prefix'",vn, $2, $3);
- }
- }'
-}
-
-
-if [ $# != 0 ];
-then
- while getopts "a:A:b:B:cC:dD:e:f:F:g:G:hJ:k:Kn:p:P:r:R:y:s:S:t:T:u:U:vx:X:Z" o
- do
- case "$o" in
- "a")
- gcov_arg=$OPTARG
- ;;
- "A")
- curlawsargs="$OPTARG"
- ;;
- "b")
- build_o="$OPTARG"
- ;;
- "B")
- branch_o="$OPTARG"
- ;;
- "c")
- clean="1"
- ;;
- "C")
- commit_o="$OPTARG"
- ;;
- "d")
- dump="1"
- ;;
- "D")
- ddp="$OPTARG"
- ;;
- "e")
- env="$env,$OPTARG"
- ;;
- "f")
- if [ "${OPTARG::1}" = "!" ];
- then
- exclude_cov="$exclude_cov -not -path '${OPTARG:1}'"
-
- elif [[ "$OPTARG" = *"*"* ]];
- then
- include_cov="$include_cov -or -name '$OPTARG'"
-
- else
- ft_search=0
- if [ "$files" = "" ];
- then
- files="$OPTARG"
- else
- files="$files
-$OPTARG"
- fi
- fi
- ;;
- "F")
- if [ "$flags" = "" ];
- then
- flags="$OPTARG"
- else
- flags="$flags,$OPTARG"
- fi
- ;;
- "g")
- gcov_ignore="$gcov_ignore -not -path '$OPTARG'"
- ;;
- "G")
- gcov_include="$gcov_include -path '$OPTARG'"
- ;;
- "h")
- show_help
- exit 0;
- ;;
- "J")
- ft_xcodellvm="1"
- ft_xcodeplist="0"
- if [ "$xp" = "" ];
- then
- xp="$OPTARG"
- else
- xp="$xp\|$OPTARG"
- fi
- ;;
- "k")
- prefix_o=$(echo "$OPTARG" | sed -e 's:^/*::' -e 's:/*$::')
- ;;
- "K")
- b=""
- g=""
- r=""
- e=""
- x=""
- ;;
- "n")
- name="$OPTARG"
- ;;
- "p")
- proj_root="$OPTARG"
- ;;
- "P")
- pr_o="$OPTARG"
- ;;
- "r")
- slug_o="$OPTARG"
- ;;
- "R")
- git_root="$OPTARG"
- ;;
- "s")
- if [ "$search_in_o" = "" ];
- then
- search_in_o="$OPTARG"
- else
- search_in_o="$search_in_o $OPTARG"
- fi
- ;;
- "S")
- cacert="--cacert \"$OPTARG\""
- ;;
- "t")
- if [ "${OPTARG::1}" = "@" ];
- then
- token=$(cat "${OPTARG:1}" | tr -d ' \n')
- else
- token="$OPTARG"
- fi
- ;;
- "T")
- tag_o="$OPTARG"
- ;;
- "u")
- url_o=$(echo "$OPTARG" | sed -e 's/\/$//')
- ;;
- "U")
- curlargs="$OPTARG"
- ;;
- "v")
- set -x
- curl_s=""
- ;;
- "x")
- gcov_exe=$OPTARG
- ;;
- "X")
- if [ "$OPTARG" = "gcov" ];
- then
- ft_gcov="0"
- elif [ "$OPTARG" = "coveragepy" ] || [ "$OPTARG" = "py" ];
- then
- ft_coveragepy="0"
- elif [ "$OPTARG" = "xcodellvm" ];
- then
- ft_xcodellvm="1"
- ft_xcodeplist="0"
- elif [ "$OPTARG" = "fix" ] || [ "$OPTARG" = "fixes" ];
- then
- ft_fix="0"
- elif [ "$OPTARG" = "xcode" ];
- then
- ft_xcodellvm="0"
- ft_xcodeplist="0"
- elif [ "$OPTARG" = "search" ];
- then
- ft_search="0"
- elif [ "$OPTARG" = "xcodepartials" ];
- then
- beta_xcode_partials="-use-color"
- elif [ "$OPTARG" = "network" ];
- then
- ft_network="0"
- elif [ "$OPTARG" = "s3" ];
- then
- ft_s3="0"
- fi
- ;;
- "y")
- codecov_yml="$OPTARG"
- ;;
- "Z")
- exit_with=1
- ;;
- esac
- done
-fi
-
-say "
- _____ _
- / ____| | |
-| | ___ __| | ___ ___ _____ __
-| | / _ \\ / _\` |/ _ \\/ __/ _ \\ \\ / /
-| |___| (_) | (_| | __/ (_| (_) \\ V /
- \\_____\\___/ \\__,_|\\___|\\___\\___/ \\_/
- Bash-$VERSION
-
-"
-
-search_in="$proj_root"
-
-if [ "$JENKINS_URL" != "" ];
-then
- say "$e==>$x Jenkins CI detected."
- # https://wiki.jenkins-ci.org/display/JENKINS/Building+a+software+project
- # https://wiki.jenkins-ci.org/display/JENKINS/GitHub+pull+request+builder+plugin#GitHubpullrequestbuilderplugin-EnvironmentVariables
- service="jenkins"
-
- if [ "$ghprbSourceBranch" != "" ];
- then
- branch="$ghprbSourceBranch"
- elif [ "$GIT_BRANCH" != "" ];
- then
- branch="$GIT_BRANCH"
- elif [ "$BRANCH_NAME" != "" ];
- then
- branch="$BRANCH_NAME"
- fi
-
- if [ "$ghprbActualCommit" != "" ];
- then
- commit="$ghprbActualCommit"
- elif [ "$GIT_COMMIT" != "" ];
- then
- commit="$GIT_COMMIT"
- fi
-
- if [ "$ghprbPullId" != "" ];
- then
- pr="$ghprbPullId"
- elif [ "$CHANGE_ID" != "" ];
- then
- pr="$CHANGE_ID"
- fi
-
- build="$BUILD_NUMBER"
- build_url=$(urlencode "$BUILD_URL")
-
-elif [ "$CI" = "true" ] && [ "$TRAVIS" = "true" ] && [ "$SHIPPABLE" != "true" ];
-then
- say "$e==>$x Travis CI detected."
- # https://docs.travis-ci.com/user/environment-variables/
- service="travis"
- commit="${TRAVIS_PULL_REQUEST_SHA:-$TRAVIS_COMMIT}"
- build="$TRAVIS_JOB_NUMBER"
- pr="$TRAVIS_PULL_REQUEST"
- job="$TRAVIS_JOB_ID"
- slug="$TRAVIS_REPO_SLUG"
- env="$env,TRAVIS_OS_NAME"
- tag="$TRAVIS_TAG"
- if [ "$TRAVIS_BRANCH" != "$TRAVIS_TAG" ];
- then
- branch="$TRAVIS_BRANCH"
- fi
-
- language=$(printenv | grep "TRAVIS_.*_VERSION" | head -1)
- if [ "$language" != "" ];
- then
- env="$env,${language%=*}"
- fi
-
-elif [ "$DOCKER_REPO" != "" ];
-then
- say "$e==>$x Docker detected."
- # https://docs.docker.com/docker-cloud/builds/advanced/
- service="docker"
- branch="$SOURCE_BRANCH"
- commit="$SOURCE_COMMIT"
- slug="$DOCKER_REPO"
- tag="$CACHE_TAG"
- env="$env,IMAGE_NAME"
-
-elif [ "$CI" = "true" ] && [ "$CI_NAME" = "codeship" ];
-then
- say "$e==>$x Codeship CI detected."
- # https://www.codeship.io/documentation/continuous-integration/set-environment-variables/
- service="codeship"
- branch="$CI_BRANCH"
- build="$CI_BUILD_NUMBER"
- build_url=$(urlencode "$CI_BUILD_URL")
- commit="$CI_COMMIT_ID"
-
-elif [ ! -z "$CF_BUILD_URL" ] && [ ! -z "$CF_BUILD_ID" ];
-then
- say "$e==>$x Codefresh CI detected."
- # https://docs.codefresh.io/v1.0/docs/variables
- service="codefresh"
- branch="$CF_BRANCH"
- build="$CF_BUILD_ID"
- build_url=$(urlencode "$CF_BUILD_URL")
- commit="$CF_REVISION"
-
-elif [ "$TEAMCITY_VERSION" != "" ];
-then
- say "$e==>$x TeamCity CI detected."
- # https://confluence.jetbrains.com/display/TCD8/Predefined+Build+Parameters
- # https://confluence.jetbrains.com/plugins/servlet/mobile#content/view/74847298
- if [ "$TEAMCITY_BUILD_BRANCH" = '' ];
- then
- echo " Teamcity does not automatically make build parameters available as environment variables."
- echo " Add the following environment parameters to the build configuration"
- echo " env.TEAMCITY_BUILD_BRANCH = %teamcity.build.branch%"
- echo " env.TEAMCITY_BUILD_ID = %teamcity.build.id%"
- echo " env.TEAMCITY_BUILD_URL = %teamcity.serverUrl%/viewLog.html?buildId=%teamcity.build.id%"
- echo " env.TEAMCITY_BUILD_COMMIT = %system.build.vcs.number%"
- echo " env.TEAMCITY_BUILD_REPOSITORY = %vcsroot..url%"
- fi
- service="teamcity"
- branch="$TEAMCITY_BUILD_BRANCH"
- build="$TEAMCITY_BUILD_ID"
- build_url=$(urlencode "$TEAMCITY_BUILD_URL")
- if [ "$TEAMCITY_BUILD_COMMIT" != "" ];
- then
- commit="$TEAMCITY_BUILD_COMMIT"
- else
- commit="$BUILD_VCS_NUMBER"
- fi
- remote_addr="$TEAMCITY_BUILD_REPOSITORY"
-
-elif [ "$CI" = "true" ] && [ "$CIRCLECI" = "true" ];
-then
- say "$e==>$x Circle CI detected."
- # https://circleci.com/docs/environment-variables
- service="circleci"
- branch="$CIRCLE_BRANCH"
- build="$CIRCLE_BUILD_NUM"
- job="$CIRCLE_NODE_INDEX"
- if [ "$CIRCLE_PROJECT_REPONAME" != "" ];
- then
- slug="$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME"
- else
- # git@github.com:owner/repo.git
- slug="${CIRCLE_REPOSITORY_URL##*:}"
- # owner/repo.git
- slug="${slug%%.git}"
- fi
- pr="$CIRCLE_PR_NUMBER"
- commit="$CIRCLE_SHA1"
- search_in="$search_in $CIRCLE_ARTIFACTS $CIRCLE_TEST_REPORTS"
-
-elif [ "$BUDDYBUILD_BRANCH" != "" ];
-then
- say "$e==>$x buddybuild detected"
- # http://docs.buddybuild.com/v6/docs/custom-prebuild-and-postbuild-steps
- service="buddybuild"
- branch="$BUDDYBUILD_BRANCH"
- build="$BUDDYBUILD_BUILD_NUMBER"
- build_url="https://dashboard.buddybuild.com/public/apps/$BUDDYBUILD_APP_ID/build/$BUDDYBUILD_BUILD_ID"
- # BUDDYBUILD_TRIGGERED_BY
- if [ "$ddp" = "$(echo ~)/Library/Developer/Xcode/DerivedData" ];
- then
- ddp="/private/tmp/sandbox/${BUDDYBUILD_APP_ID}/bbtest"
- fi
-
-elif [ "${bamboo_planRepository_revision}" != "" ];
-then
- say "$e==>$x Bamboo detected"
- # https://confluence.atlassian.com/bamboo/bamboo-variables-289277087.html#Bamboovariables-Build-specificvariables
- service="bamboo"
- commit="${bamboo_planRepository_revision}"
- branch="${bamboo_planRepository_branch}"
- build="${bamboo_buildNumber}"
- build_url="${bamboo_buildResultsUrl}"
- remote_addr="${bamboo_planRepository_repositoryUrl}"
-
-elif [ "$CI" = "true" ] && [ "$BITRISE_IO" = "true" ];
-then
- # http://devcenter.bitrise.io/faq/available-environment-variables/
- say "$e==>$x Bitrise CI detected."
- service="bitrise"
- branch="$BITRISE_GIT_BRANCH"
- build="$BITRISE_BUILD_NUMBER"
- build_url=$(urlencode "$BITRISE_BUILD_URL")
- pr="$BITRISE_PULL_REQUEST"
- if [ "$GIT_CLONE_COMMIT_HASH" != "" ];
- then
- commit="$GIT_CLONE_COMMIT_HASH"
- fi
-
-elif [ "$CI" = "true" ] && [ "$SEMAPHORE" = "true" ];
-then
- say "$e==>$x Semaphore CI detected."
- # https://semaphoreapp.com/docs/available-environment-variables.html
- service="semaphore"
- branch="$BRANCH_NAME"
- build="$SEMAPHORE_BUILD_NUMBER"
- job="$SEMAPHORE_CURRENT_THREAD"
- pr="$PULL_REQUEST_NUMBER"
- slug="$SEMAPHORE_REPO_SLUG"
- commit="$REVISION"
- env="$env,SEMAPHORE_TRIGGER_SOURCE"
-
-elif [ "$CI" = "true" ] && [ "$BUILDKITE" = "true" ];
-then
- say "$e==>$x Buildkite CI detected."
- # https://buildkite.com/docs/guides/environment-variables
- service="buildkite"
- branch="$BUILDKITE_BRANCH"
- build="$BUILDKITE_BUILD_NUMBER"
- job="$BUILDKITE_JOB_ID"
- build_url=$(urlencode "$BUILDKITE_BUILD_URL")
- slug="$BUILDKITE_PROJECT_SLUG"
- commit="$BUILDKITE_COMMIT"
- if [[ "$BUILDKITE_PULL_REQUEST" != "false" ]]; then
- pr="$BUILDKITE_PULL_REQUEST"
- fi
- tag="$BUILDKITE_TAG"
-
-elif [ "$CI" = "drone" ] || [ "$DRONE" = "true" ];
-then
- say "$e==>$x Drone CI detected."
- # http://docs.drone.io/env.html
- # drone commits are not full shas
- service="drone.io"
- branch="$DRONE_BRANCH"
- build="$DRONE_BUILD_NUMBER"
- build_url=$(urlencode "${DRONE_BUILD_LINK}")
- pr="$DRONE_PULL_REQUEST"
- job="$DRONE_JOB_NUMBER"
- tag="$DRONE_TAG"
-
-elif [ "$HEROKU_TEST_RUN_BRANCH" != "" ];
-then
- say "$e==>$x Heroku CI detected."
- # https://devcenter.heroku.com/articles/heroku-ci#environment-variables
- service="heroku"
- branch="$HEROKU_TEST_RUN_BRANCH"
- build="$HEROKU_TEST_RUN_ID"
-
-elif [ "$CI" = "True" ] && [ "$APPVEYOR" = "True" ];
-then
- say "$e==>$x Appveyor CI detected."
- # http://www.appveyor.com/docs/environment-variables
- service="appveyor"
- branch="$APPVEYOR_REPO_BRANCH"
- build=$(urlencode "$APPVEYOR_JOB_ID")
- pr="$APPVEYOR_PULL_REQUEST_NUMBER"
- job="$APPVEYOR_ACCOUNT_NAME%2F$APPVEYOR_PROJECT_SLUG%2F$APPVEYOR_BUILD_VERSION"
- slug="$APPVEYOR_REPO_NAME"
- commit="$APPVEYOR_REPO_COMMIT"
-
-elif [ "$CI" = "true" ] && [ "$WERCKER_GIT_BRANCH" != "" ];
-then
- say "$e==>$x Wercker CI detected."
- # http://devcenter.wercker.com/articles/steps/variables.html
- service="wercker"
- branch="$WERCKER_GIT_BRANCH"
- build="$WERCKER_MAIN_PIPELINE_STARTED"
- slug="$WERCKER_GIT_OWNER/$WERCKER_GIT_REPOSITORY"
- commit="$WERCKER_GIT_COMMIT"
-
-elif [ "$CI" = "true" ] && [ "$MAGNUM" = "true" ];
-then
- say "$e==>$x Magnum CI detected."
- # https://magnum-ci.com/docs/environment
- service="magnum"
- branch="$CI_BRANCH"
- build="$CI_BUILD_NUMBER"
- commit="$CI_COMMIT"
-
-elif [ "$SHIPPABLE" = "true" ];
-then
- say "$e==>$x Shippable CI detected."
- # http://docs.shippable.com/ci_configure/
- service="shippable"
- branch=$([ "$HEAD_BRANCH" != "" ] && echo "$HEAD_BRANCH" || echo "$BRANCH")
- build="$BUILD_NUMBER"
- build_url=$(urlencode "$BUILD_URL")
- pr="$PULL_REQUEST"
- slug="$REPO_FULL_NAME"
- commit="$COMMIT"
-
-elif [ "$TDDIUM" = "true" ];
-then
- say "Solano CI detected."
- # http://docs.solanolabs.com/Setup/tddium-set-environment-variables/
- service="solano"
- commit="$TDDIUM_CURRENT_COMMIT"
- branch="$TDDIUM_CURRENT_BRANCH"
- build="$TDDIUM_TID"
- pr="$TDDIUM_PR_ID"
-
-elif [ "$GREENHOUSE" = "true" ];
-then
- say "$e==>$x Greenhouse CI detected."
- # http://docs.greenhouseci.com/docs/environment-variables-files
- service="greenhouse"
- branch="$GREENHOUSE_BRANCH"
- build="$GREENHOUSE_BUILD_NUMBER"
- build_url=$(urlencode "$GREENHOUSE_BUILD_URL")
- pr="$GREENHOUSE_PULL_REQUEST"
- commit="$GREENHOUSE_COMMIT"
- search_in="$search_in $GREENHOUSE_EXPORT_DIR"
-
-elif [ "$GITLAB_CI" != "" ];
-then
- say "$e==>$x GitLab CI detected."
- # http://doc.gitlab.com/ce/ci/variables/README.html
- service="gitlab"
- branch="${CI_BUILD_REF_NAME:-$CI_COMMIT_REF_NAME}"
- build="${CI_BUILD_ID:-$CI_JOB_ID}"
- remote_addr="${CI_BUILD_REPO:-$CI_REPOSITORY_URL}"
- commit="${CI_BUILD_REF:-$CI_COMMIT_SHA}"
-
-else
- say "${r}x>${x} No CI provider detected."
- say " Testing inside Docker? ${b}http://docs.codecov.io/docs/testing-with-docker${x}"
- say " Testing with Tox? ${b}https://docs.codecov.io/docs/python#section-testing-with-tox${x}"
-
-fi
-
-say " ${e}project root:${x} $git_root"
-
-# find branch, commit, repo from git command
-if [ "$GIT_BRANCH" != "" ];
-then
- branch="$GIT_BRANCH"
-
-elif [ "$branch" = "" ];
-then
- branch=$(git rev-parse --abbrev-ref HEAD 2>/dev/null || hg branch 2>/dev/null || echo "")
- if [ "$branch" = "HEAD" ];
- then
- branch=""
- fi
-fi
-
-if [ "$commit_o" = "" ];
-then
- # merge commit -> actual commit
- mc=
- if [ -n "$pr" ] && [ "$pr" != false ];
- then
- mc=$(git show --no-patch --format="%P" 2>/dev/null || echo "")
- fi
- if [[ "$mc" =~ ^[a-z0-9]{40}[[:space:]][a-z0-9]{40}$ ]];
- then
- say " Fixing merge commit SHA"
- commit=$(echo "$mc" | cut -d' ' -f2)
- elif [ "$GIT_COMMIT" != "" ];
- then
- commit="$GIT_COMMIT"
- elif [ "$commit" = "" ];
- then
- commit=$(git log -1 --format="%H" 2>/dev/null || hg id -i --debug 2>/dev/null | tr -d '+' || echo "")
- fi
-else
- commit="$commit_o"
-fi
-
-if [ "$CODECOV_TOKEN" != "" ] && [ "$token" = "" ];
-then
- say "${e}-->${x} token set from env"
- token="$CODECOV_TOKEN"
-fi
-
-if [ "$CODECOV_URL" != "" ] && [ "$url_o" = "" ];
-then
- say "${e}-->${x} url set from env"
- url_o=$(echo "$CODECOV_URL" | sed -e 's/\/$//')
-fi
-
-if [ "$CODECOV_SLUG" != "" ];
-then
- say "${e}-->${x} slug set from env"
- slug_o="$CODECOV_SLUG"
-
-elif [ "$slug" = "" ];
-then
- if [ "$remote_addr" = "" ];
- then
- remote_addr=$(git config --get remote.origin.url || hg paths default || echo '')
- fi
- if [ "$remote_addr" != "" ];
- then
- if echo "$remote_addr" | grep -q "//"; then
- # https
- slug=$(echo "$remote_addr" | cut -d / -f 4,5 | sed -e 's/\.git$//')
- else
- # ssh
- slug=$(echo "$remote_addr" | cut -d : -f 2 | sed -e 's/\.git$//')
- fi
- fi
- if [ "$slug" = "/" ];
- then
- slug=""
- fi
-fi
-
-yaml=$(test -n "$codecov_yml" && echo "$codecov_yml" \
- || cd "$git_root" && \
- git ls-files "*codecov.yml" "*codecov.yaml" 2>/dev/null \
- || hg locate "*codecov.yml" "*codecov.yaml" 2>/dev/null \
- || cd $proj_root && find . -type f -name '*codecov.y*ml' -depth 1 2>/dev/null \
- || echo '')
-yaml=$(echo "$yaml" | head -1)
-
-if [ "$yaml" != "" ];
-then
- say " ${e}Yaml found at:${x} $yaml"
- config=$(parse_yaml "$git_root/$yaml" || echo '')
-
- # TODO validate the yaml here
-
- if [ "$(echo "$config" | grep 'codecov_token="')" != "" ] && [ "$token" = "" ];
- then
- say "${e}-->${x} token set from yaml"
- token="$(echo "$config" | grep 'codecov_token="' | sed -e 's/codecov_token="//' | sed -e 's/"\.*//')"
- fi
-
- if [ "$(echo "$config" | grep 'codecov_url="')" != "" ] && [ "$url_o" = "" ];
- then
- say "${e}-->${x} url set from yaml"
- url_o="$(echo "$config" | grep 'codecov_url="' | sed -e 's/codecov_url="//' | sed -e 's/"\.*//')"
- fi
-
- if [ "$(echo "$config" | grep 'codecov_slug="')" != "" ] && [ "$slug_o" = "" ];
- then
- say "${e}-->${x} slug set from yaml"
- slug_o="$(echo "$config" | grep 'codecov_slug="' | sed -e 's/codecov_slug="//' | sed -e 's/"\.*//')"
- fi
-else
- say " ${g}Yaml not found, that's ok! Learn more at${x} ${b}http://docs.codecov.io/docs/codecov-yaml${x}"
-
-fi
-
-if [ "$branch_o" != "" ];
-then
- branch=$(urlencode "$branch_o")
-else
- branch=$(urlencode "$branch")
-fi
-
-query="branch=$branch\
- &commit=$commit\
- &build=$([ "$build_o" = "" ] && echo "$build" || echo "$build_o")\
- &build_url=$build_url\
- &name=$(urlencode "$name")\
- &tag=$([ "$tag_o" = "" ] && echo "$tag" || echo "$tag_o")\
- &slug=$([ "$slug_o" = "" ] && urlencode "$slug" || urlencode "$slug_o")\
- &service=$service\
- &flags=$flags\
- &pr=$([ "$pr_o" = "" ] && echo "${pr##\#}" || echo "${pr_o##\#}")\
- &job=$job"
-
-if [ "$ft_search" = "1" ];
-then
- # detect bower comoponents location
- bower_components="bower_components"
- bower_rc=$(cd "$git_root" && cat .bowerrc 2>/dev/null || echo "")
- if [ "$bower_rc" != "" ];
- then
- bower_components=$(echo "$bower_rc" | tr -d '\n' | grep '"directory"' | cut -d'"' -f4 | sed -e 's/\/$//')
- if [ "$bower_components" = "" ];
- then
- bower_components="bower_components"
- fi
- fi
-
- # Swift Coverage
- if [ "$ft_xcodellvm" = "1" ] && [ -d "$ddp" ];
- then
- say "${e}==>${x} Processing Xcode reports via llvm-cov"
- say " DerivedData folder: $ddp"
- profdata_files=$(find "$ddp" -name '*.profdata' 2>/dev/null || echo '')
- if [ "$profdata_files" != "" ];
- then
- # xcode via profdata
- if [ "$xp" = "" ];
- then
- # xp=$(xcodebuild -showBuildSettings 2>/dev/null | grep -i "^\s*PRODUCT_NAME" | sed -e 's/.*= \(.*\)/\1/')
- # say " ${e}->${x} Speed up Xcode processing by adding ${e}-J '$xp'${x}"
- say " ${g}hint${x} Speed up Swift processing by using use ${g}-J 'AppName'${x} (regexp accepted)"
- say " ${g}hint${x} This will remove Pods/ from your report. Also ${b}https://docs.codecov.io/docs/ignoring-paths${x}"
- fi
- while read -r profdata;
- do
- if [ "$profdata" != "" ];
- then
- swiftcov "$profdata" "$xp"
- fi
- done <<< "$profdata_files"
- else
- say " ${e}->${x} No Swift coverage found"
- fi
-
- # Obj-C Gcov Coverage
- if [ "$ft_gcov" = "1" ];
- then
- say " ${e}->${x} Running $gcov_exe for Obj-C"
- bash -c "find $ddp -type f -name '*.gcda' $gcov_include $gcov_ignore -exec $gcov_exe -p $gcov_arg {} +" || true
- fi
- fi
-
- if [ "$ft_xcodeplist" = "1" ] && [ -d "$ddp" ];
- then
- say "${e}==>${x} Processing Xcode plists"
- plists_files=$(find "$ddp" -name '*.xccoverage' 2>/dev/null || echo '')
- if [ "$plists_files" != "" ];
- then
- while read -r plist;
- do
- if [ "$plist" != "" ];
- then
- say " ${g}Found${x} plist file at $plist"
- plutil -convert xml1 -o "$(basename "$plist").plist" -- $plist
- fi
- done <<< "$plists_files"
- fi
- fi
-
- # Gcov Coverage
- if [ "$ft_gcov" = "1" ];
- then
- say "${e}==>${x} Running gcov in $proj_root ${e}(disable via -X gcov)${x}"
- bash -c "find $proj_root -type f -name '*.gcno' $gcov_include $gcov_ignore -exec $gcov_exe -pb $gcov_arg {} +" || true
- else
- say "${e}==>${x} gcov disabled"
- fi
-
- # Python Coverage
- if [ "$ft_coveragepy" = "1" ];
- then
- if [ ! -f coverage.xml ];
- then
- if which coverage >/dev/null 2>&1;
- then
- say "${e}==>${x} Python coveragepy exists ${e}disable via -X coveragepy${x}"
-
- dotcoverage=$(find "$git_root" -name '.coverage' -or -name '.coverage.*' | head -1 || echo '')
- if [ "$dotcoverage" != "" ];
- then
- cd "$(dirname "$dotcoverage")"
- if [ ! -f .coverage ];
- then
- say " ${e}->${x} Running coverage combine"
- coverage combine -a
- fi
- say " ${e}->${x} Running coverage xml"
- if [ "$(coverage xml -i)" != "No data to report." ];
- then
- files="$files
-$PWD/coverage.xml"
- else
- say " ${r}No data to report.${x}"
- fi
- cd "$proj_root"
- else
- say " ${r}No .coverage file found.${x}"
- fi
- else
- say "${e}==>${x} Python coveragepy not found"
- fi
- fi
- else
- say "${e}==>${x} Python coveragepy disabled"
- fi
-
- if [ "$search_in_o" != "" ];
- then
- # location override
- search_in="$search_in_o"
- fi
-
- say "$e==>$x Searching for coverage reports in:"
- for _path in $search_in
- do
- say " ${g}+${x} $_path"
- done
-
- patterns="find $search_in \( \
- -name vendor \
- -or -name htmlcov \
- -or -name virtualenv \
- -or -name js/generated/coverage \
- -or -name .virtualenv \
- -or -name virtualenvs \
- -or -name .virtualenvs \
- -or -name .env \
- -or -name .envs \
- -or -name env \
- -or -name .yarn-cache \
- -or -name envs \
- -or -name .venv \
- -or -name .venvs \
- -or -name venv \
- -or -name venvs \
- -or -name .git \
- -or -name .hg \
- -or -name .tox \
- -or -name __pycache__ \
- -or -name '.egg-info*' \
- -or -name '$bower_components' \
- -or -name node_modules \
- -or -name 'conftest_*.c.gcov' \
- \) -prune -or \
- -type f \( -name '*coverage*.*' \
- -or -name 'nosetests.xml' \
- -or -name 'jacoco*.xml' \
- -or -name 'clover.xml' \
- -or -name 'report.xml' \
- -or -name '*.codecov.*' \
- -or -name 'codecov.*' \
- -or -name 'cobertura.xml' \
- -or -name 'excoveralls.json' \
- -or -name 'luacov.report.out' \
- -or -name 'coverage-final.json' \
- -or -name 'naxsi.info' \
- -or -name 'lcov.info' \
- -or -name 'lcov.dat' \
- -or -name '*.lcov' \
- -or -name '*.clover' \
- -or -name 'cover.out' \
- -or -name 'gcov.info' \
- -or -name '*.gcov' \
- -or -name '*.lst' \
- $include_cov \) \
- $exclude_cov \
- -not -name '*.profdata' \
- -not -name 'coverage-summary.json' \
- -not -name 'phpunit-code-coverage.xml' \
- -not -name '*/classycle/report.xml' \
- -not -name 'remapInstanbul.coverage*.json' \
- -not -name 'phpunit-coverage.xml' \
- -not -name '*codecov.yml' \
- -not -name '*.serialized' \
- -not -name '.coverage*' \
- -not -name '.*coveragerc' \
- -not -name '*.sh' \
- -not -name '*.bat' \
- -not -name '*.ps1' \
- -not -name '*.env' \
- -not -name '*.cmake' \
- -not -name '*.dox' \
- -not -name '*.ec' \
- -not -name '*.rst' \
- -not -name '*.h' \
- -not -name '*.scss' \
- -not -name '*.o' \
- -not -name '*.proto' \
- -not -name '*.sbt' \
- -not -name '*.xcoverage.*' \
- -not -name '*.gz' \
- -not -name '*.conf' \
- -not -name '*.p12' \
- -not -name '*.csv' \
- -not -name '*.rsp' \
- -not -name '*.m4' \
- -not -name '*.pem' \
- -not -name '*~' \
- -not -name '*.exe' \
- -not -name '*.am' \
- -not -name '*.template' \
- -not -name '*.cp' \
- -not -name '*.bw' \
- -not -name '*.crt' \
- -not -name '*.log' \
- -not -name '*.cmake' \
- -not -name '*.pth' \
- -not -name '*.in' \
- -not -name '*.jar*' \
- -not -name '*.pom*' \
- -not -name '*.png' \
- -not -name '*.jpg' \
- -not -name '*.sql' \
- -not -name '*.jpeg' \
- -not -name '*.svg' \
- -not -name '*.gif' \
- -not -name '*.csv' \
- -not -name '*.snapshot' \
- -not -name '*.mak*' \
- -not -name '*.bash' \
- -not -name '*.data' \
- -not -name '*.py' \
- -not -name '*.class' \
- -not -name '*.xcconfig' \
- -not -name '*.ec' \
- -not -name '*.coverage' \
- -not -name '*.pyc' \
- -not -name '*.cfg' \
- -not -name '*.egg' \
- -not -name '*.ru' \
- -not -name '*.css' \
- -not -name '*.less' \
- -not -name '*.pyo' \
- -not -name '*.whl' \
- -not -name '*.html' \
- -not -name '*.ftl' \
- -not -name '*.erb' \
- -not -name '*.rb' \
- -not -name '*.js' \
- -not -name '*.jade' \
- -not -name '*.db' \
- -not -name '*.md' \
- -not -name '*.cpp' \
- -not -name '*.gradle' \
- -not -name '*.tar.tz' \
- -not -name '*.scss' \
- -not -name 'include.lst' \
- -not -name 'fullLocaleNames.lst' \
- -not -name 'inputFiles.lst' \
- -not -name 'createdFiles.lst' \
- -not -name 'scoverage.measurements.*' \
- -not -name 'test_*_coverage.txt' \
- -not -name 'testrunner-coverage*' \
- -print 2>/dev/null"
- files=$(eval "$patterns" || echo '')
-
-elif [ "$include_cov" != "" ];
-then
- files=$(eval "find $search_in -type f \( ${include_cov:5} \)$exclude_cov 2>/dev/null" || echo '')
-fi
-
-num_of_files=$(echo "$files" | wc -l | tr -d ' ')
-if [ "$num_of_files" != '' ] && [ "$files" != '' ];
-then
- say " ${e}->${x} Found $num_of_files reports"
-fi
-
-# no files found
-if [ "$files" = "" ];
-then
- say "${r}-->${x} No coverage report found."
- say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}"
- exit ${exit_with};
-fi
-
-if [ "$ft_network" == "1" ];
-then
- say "${e}==>${x} Detecting git/mercurial file structure"
- network=$(cd "$git_root" && git ls-files 2>/dev/null || hg locate 2>/dev/null || echo "")
- if [ "$network" = "" ];
- then
- network=$(find "$git_root" \( \
- -name virtualenv \
- -name .virtualenv \
- -name virtualenvs \
- -name .virtualenvs \
- -name '*.png' \
- -name '*.gif' \
- -name '*.jpg' \
- -name '*.jpeg' \
- -name '*.md' \
- -name .env \
- -name .envs \
- -name env \
- -name envs \
- -name .venv \
- -name .venvs \
- -name venv \
- -name venvs \
- -name .git \
- -name .egg-info \
- -name shunit2-2.1.6 \
- -name vendor \
- -name __pycache__ \
- -name node_modules \
- -path '*/$bower_components/*' \
- -path '*/target/delombok/*' \
- -path '*/build/lib/*' \
- -path '*/js/generated/coverage/*' \
- \) -prune -or \
- -type f -print 2>/dev/null || echo '')
- fi
-
- if [ "$prefix_o" != "" ];
- then
- network=$(echo "$network" | awk "{print \"$prefix_o/\"\$0}")
- fi
-fi
-
-upload_file=`mktemp /tmp/codecov.XXXXXX`
-adjustments_file=`mktemp /tmp/codecov.adjustments.XXXXXX`
-
-cleanup() {
- rm -f $upload_file $adjustments_file $upload_file.gz
-}
-
-trap cleanup INT ABRT TERM
-
-if [ "$env" != "" ];
-then
- inc_env=""
- say "${e}==>${x} Appending build variables"
- for varname in $(echo "$env" | tr ',' ' ')
- do
- if [ "$varname" != "" ];
- then
- say " ${g}+${x} $varname"
- inc_env="${inc_env}${varname}=$(eval echo "\$${varname}")
-"
- fi
- done
-
-echo "$inc_env<<<<<< ENV" >> $upload_file
-fi
-
-# Append git file list
-# write discovered yaml location
-echo "$yaml" >> $upload_file
-if [ "$ft_network" == "1" ];
-then
- i="woff|eot|otf" # fonts
- i="$i|gif|png|jpg|jpeg|psd" # images
- i="$i|ptt|pptx|numbers|pages|md|txt|xlsx|docx|doc|pdf|html|csv" # docs
- i="$i|yml|yaml|.gitignore" # supporting docs
- echo "$network" | grep -vwE "($i)$" >> $upload_file
-fi
-echo "<<<<<< network" >> $upload_file
-
-fr=0
-say "${e}==>${x} Reading reports"
-while IFS='' read -r file;
-do
- # read the coverage file
- if [ "$(echo "$file" | tr -d ' ')" != '' ];
- then
- if [ -f "$file" ];
- then
- report_len=$(wc -c < "$file")
- if [ "$report_len" -ne 0 ];
- then
- say " ${g}+${x} $file ${e}bytes=$(echo "$report_len" | tr -d ' ')${x}"
- # append to to upload
- _filename=$(basename "$file")
- if [ "${_filename##*.}" = 'gcov' ];
- then
- echo "# path=$(echo "$file.reduced" | sed "s|^$git_root/||")" >> $upload_file
- # get file name
- head -1 $file >> $upload_file
- # 1. remove source code
- # 2. remove ending bracket lines
- # 3. remove whitespace
- # 4. remove contextual lines
- # 5. remove function names
- awk -F': *' '{print $1":"$2":"}' $file \
- | sed '\/: *} *$/d' \
- | sed 's/^ *//' \
- | sed '/^-/d' \
- | sed 's/^function.*/func/' >> $upload_file
- else
- echo "# path=$(echo "$file" | sed "s|^$git_root/||")" >> $upload_file
- cat "$file" >> $upload_file
- fi
- echo "<<<<<< EOF" >> $upload_file
- fr=1
- if [ "$clean" = "1" ];
- then
- rm "$file"
- fi
- else
- say " ${r}-${x} Skipping empty file $file"
- fi
- else
- say " ${r}-${x} file not found at $file"
- fi
- fi
-done <<< "$(echo -e "$files")"
-
-if [ "$fr" = "0" ];
-then
- say "${r}-->${x} No coverage data found."
- say " Please visit ${b}http://docs.codecov.io/docs/supported-languages${x}"
- say " search for your projects language to learn how to collect reports."
- exit ${exit_with};
-fi
-
-if [ "$ft_fix" = "1" ];
-then
- say "${e}==>${x} Appending adjustments"
- say " ${b}http://docs.codecov.io/docs/fixing-reports${x}"
-
- empty_line='^[[:space:]]*$'
- # //
- syntax_comment='^[[:space:]]*//.*'
- # /* or */
- syntax_comment_block='^[[:space:]]*(\/\*|\*\/)[[:space:]]*$'
- # { or }
- syntax_bracket='^[[:space:]]*[\{\}][[:space:]]*(//.*)?$'
- # [ or ]
- syntax_list='^[[:space:]]*[][][[:space:]]*(//.*)?$'
-
- skip_dirs="-not -path '*/$bower_components/*' \
- -not -path '*/node_modules/*'"
-
- cut_and_join() {
- awk 'BEGIN { FS=":" }
- $3 ~ /\/\*/ || $3 ~ /\*\// { print $0 ; next }
- $1!=key { if (key!="") print out ; key=$1 ; out=$1":"$2 ; next }
- { out=out","$2 }
- END { print out }' 2>/dev/null
- }
-
- if echo "$network" | grep -m1 '.kt$' 1>/dev/null;
- then
- # skip brackets and comments
- find "$git_root" -type f \
- -name '*.kt' \
- -exec \
- grep -nIHE -e $syntax_bracket \
- -e $syntax_comment_block {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
-
- # last line in file
- find "$git_root" -type f \
- -name '*.kt' -exec \
- wc -l {} \; \
- | while read l; do echo "EOF: $l"; done \
- 2>/dev/null \
- >> $adjustments_file \
- || echo ''
-
- fi
-
- if echo "$network" | grep -m1 '.go$' 1>/dev/null;
- then
- # skip empty lines, comments, and brackets
- find "$git_root" -not -path '*/vendor/*' \
- -type f \
- -name '*.go' \
- -exec \
- grep -nIHE \
- -e $empty_line \
- -e $syntax_comment \
- -e $syntax_comment_block \
- -e $syntax_bracket \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '.dart$' 1>/dev/null;
- then
- # skip brackets
- find "$git_root" -type f \
- -name '*.dart' \
- -exec \
- grep -nIHE \
- -e $syntax_bracket \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '.php$' 1>/dev/null;
- then
- # skip empty lines, comments, and brackets
- find "$git_root" -not -path "*/vendor/*" \
- -type f \
- -name '*.php' \
- -exec \
- grep -nIHE \
- -e $syntax_list \
- -e $syntax_bracket \
- -e '^[[:space:]]*\);[[:space:]]*(//.*)?$' \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
- fi
-
- if echo "$network" | grep -m1 '\(.cpp\|.h\|.cxx\|.c\|.hpp\|.m\)$' 1>/dev/null;
- then
- # skip brackets
- find "$git_root" -type f \
- $skip_dirs \
- \( \
- -name '*.h' \
- -or -name '*.cpp' \
- -or -name '*.cxx' \
- -or -name '*.m' \
- -or -name '*.c' \
- -or -name '*.hpp' \
- \) -exec \
- grep -nIHE \
- -e $empty_line \
- -e $syntax_bracket \
- -e '// LCOV_EXCL' \
- {} \; \
- | cut_and_join \
- >> $adjustments_file \
- || echo ''
-
- # skip brackets
- find "$git_root" -type f \
- $skip_dirs \
- \( \
- -name '*.h' \
- -or -name '*.cpp' \
- -or -name '*.cxx' \
- -or -name '*.m' \
- -or -name '*.c' \
- -or -name '*.hpp' \
- \) -exec \
- grep -nIH '// LCOV_EXCL' \
- {} \; \
- >> $adjustments_file \
- || echo ''
-
- fi
-
- found=$(cat $adjustments_file | tr -d ' ')
-
- if [ "$found" != "" ];
- then
- say " ${g}+${x} Found adjustments"
- echo "# path=fixes" >> $upload_file
- cat $adjustments_file >> $upload_file
- echo "<<<<<< EOF" >> $upload_file
- rm -rf $adjustments_file
- else
- say " ${e}->${x} No adjustments found"
- fi
-fi
-
-if [ "$url_o" != "" ];
-then
- url="$url_o"
-fi
-
-if [ "$dump" != "0" ];
-then
- # trim whitespace from query
- say " ${e}->${x} Dumping upload file (no upload)"
- echo "$url/upload/v4?$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ')"
- cat $upload_file
-else
-
- say "${e}==>${x} Gzipping contents"
- gzip -nf9 $upload_file
-
- query=$(echo "${query}" | tr -d ' ')
- say "${e}==>${x} Uploading reports"
- say " ${e}url:${x} $url"
- say " ${e}query:${x} $query"
-
- # now add token to query
- query=$(echo "package=bash-$VERSION&token=$token&$query" | tr -d ' ')
-
- if [ "$ft_s3" = "1" ];
- then
- i="0"
- while [ $i -lt 4 ]
- do
- i=$[$i+1]
- say " ${e}->${x} Pinging Codecov"
- res=$(curl $curl_s -X POST $curlargs $cacert \
- -H 'X-Reduced-Redundancy: false' \
- -H 'X-Content-Type: application/x-gzip' \
- "$url/upload/v4?$query" || true)
-      # a good reply is "https://codecov.io" + "\n" + "https://codecov.s3.amazonaws.com/..."
- status=$(echo "$res" | head -1 | grep 'HTTP ' | cut -d' ' -f2)
- if [ "$status" = "" ];
- then
- s3target=$(echo "$res" | sed -n 2p)
- say " ${e}->${x} Uploading"
- s3=$(curl $curl_s -fiX PUT $curlawsargs \
- --data-binary @$upload_file.gz \
- -H 'Content-Type: application/x-gzip' \
- -H 'Content-Encoding: gzip' \
- -H 'x-amz-acl: public-read' \
- "$s3target" || true)
- if [ "$s3" != "" ];
- then
- say " ${g}->${x} View reports at ${b}$(echo "$res" | sed -n 1p)${x}"
- exit 0
- else
- say " ${r}X>${x} Failed to upload"
- fi
- elif [ "$status" = "400" ];
- then
- # 400 Error
- say "${g}${res}${x}"
- exit ${exit_with}
- fi
- say " ${e}->${x} Sleeping for 30s and trying again..."
- sleep 30
- done
- fi
-
- say " ${e}->${x} Uploading to Codecov"
- i="0"
- while [ $i -lt 4 ]
- do
- i=$[$i+1]
-
- res=$(curl $curl_s -X POST $curlargs $cacert \
- --data-binary @$upload_file.gz \
- -H 'Content-Type: text/plain' \
- -H 'Content-Encoding: gzip' \
- -H 'X-Content-Encoding: gzip' \
- -H 'Accept: text/plain' \
- "$url/upload/v2?$query" || echo 'HTTP 500')
- # HTTP 200
- # http://....
- status=$(echo "$res" | head -1 | cut -d' ' -f2)
- if [ "$status" = "" ];
- then
- say " View reports at ${b}$(echo "$res" | head -2 | tail -1)${x}"
- exit 0
-
- elif [ "${status:0:1}" = "5" ];
- then
- say " ${e}->${x} Sleeping for 30s and trying again..."
- sleep 30
-
- else
- say " ${g}${res}${x}"
- exit 0
- exit ${exit_with}
- fi
-
- done
-
- say " ${r}X> Failed to upload coverage reports${x}"
-fi
-
-exit ${exit_with}
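
Aside (this sketch is not part of the diff): the cut_and_join awk helper in the deleted codecov script above is fairly dense. A rough Python rendering of its main behaviour -- collapsing "path:line:match" grep output into one "path:l1,l2,..." entry per file for the report "fixes" section, and ignoring the script's special handling of lines containing /* or */ -- could look like this:

from collections import OrderedDict

def cut_and_join(grep_lines):
    # Group the matched line numbers by file path, preserving first-seen order.
    grouped = OrderedDict()
    for line in grep_lines:
        path, lineno = line.split(':', 2)[:2]
        grouped.setdefault(path, []).append(lineno)
    return ['{}:{}'.format(path, ','.join(nums)) for path, nums in grouped.items()]

print(cut_and_join(['A.kt:3: {', 'A.kt:7: }', 'B.kt:12: {']))
# ['A.kt:3,7', 'B.kt:12']
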
diff --git a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest b/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest
deleted file mode 100644
index fb87760de6..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_cpu_unittest
+++ /dev/null
@@ -1,69 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-cpu-py3-master', 'cpu/py3-master', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or serial or skip_master)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or skip_master) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or integration or skip_master)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- '(not (gpu or integration or skip_master)) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or skip_master) and integration',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or skip_master) and serial and integration',
- 0, false, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc b/ci/jenkins/Jenkinsfile_py3-master_gpu_doc
deleted file mode 100644
index 82d6cc5fee..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_doc
+++ /dev/null
@@ -1,168 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Doc Test', [
- build_steps.test_doctest('gluon-nlp-cpu-py3-master', 'cpu/py3-master',
- 'src/gluonnlp', 'src/gluonnlp', 4)
- ])
-
- // Compile example notebooks, Doctest & Create Website
- node { // Single node parallelism
- ws('gluon-nlp-cpu-py3-master') {
- stage("Prepare conda environment for website") {
- utils.init_git()
- // Require a full environment here due to sphinx build step
- // after compiling and downloading the notebooks
- sh 'source ci/prepare_clean_env.sh cpu/py3-master'
- }
-
- stage("Create Website") {
- def tests = [:]
- for (f in findFiles(glob: '**/docs/examples/*/*.md')) {
- def md_file = f.toString() // Convert FileWrapper to String
- def short_name = md_file["docs/examples/".length()..-1]
- tests[short_name] = { ->
- def base_name = md_file[0..-4] + ''
- def ipynb_file = base_name + '.ipynb'
- def stdout_file = base_name + '.stdout.log'
- def stderr_file = base_name + '.stderr.log'
- stage(short_name) { // remove common path from name
- // Submit AWS Batch jobs for each example notebook
- // The converted notebooks and the conversion logs are
- // saved to S3 and retrieved on the CI server once the jobs
- // finished.
-
- if (env.BRANCH_NAME.startsWith('PR-')){
- sh """
- set +e
- conda activate ./conda/cpu/py3-master
-
- python3 ci/batch/submit-job.py --region us-east-1 --wait \
- --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \
- --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \
- --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \
- --work-dir . --source-ref refs/pull/${env.CHANGE_ID}/head \
- --command \"(python3 docs/md2ipynb.py ${md_file} | tee ${stdout_file}) 3>&1 1>&2 2>&3 | tee ${stderr_file} \"
- BATCH_EXIT_CODE=\$?
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file} ${stderr_file}
- cat ${stderr_file}
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file} ${stdout_file}
- cat ${stdout_file}
-
- if [ \$BATCH_EXIT_CODE -ne 0 ]; then
- echo AWS Batch Task Failed
- else
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file} ${ipynb_file}
- fi
-
- exit \$BATCH_EXIT_CODE
- """
- } else {
- sh """
- set +e
- conda activate ./conda/cpu/py3-master
-
- python3 ci/batch/submit-job.py --region us-east-1 --wait \
- --timeout 1800 --saved-output ./docs/examples --conda-env docker/py3 \
- --name GluonNLP-${env.BRANCH_NAME}-${env.BUILD_NUMBER} \
- --save-path batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/docs/examples \
- --work-dir . --source-ref ${env.BRANCH_NAME} \
- --command \"(python3 docs/md2ipynb.py ${md_file} | tee ${stdout_file}) 3>&1 1>&2 2>&3 | tee ${stderr_file} \"
- BATCH_EXIT_CODE=\$?
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stderr_file} ${stderr_file}
- cat ${stderr_file}
-
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${stdout_file} ${stdout_file}
- cat ${stdout_file}
-
- if [ \$BATCH_EXIT_CODE -ne 0 ]; then
- echo AWS Batch Task Failed
- else
- aws s3api wait object-exists --bucket gluon-nlp-staging \
- --key batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file}
- aws s3 cp s3://gluon-nlp-staging/batch/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/${ipynb_file} ${ipynb_file}
- fi
-
- exit \$BATCH_EXIT_CODE
- """
- }
- }
- }
- }
-
- parallel tests
- }
-
- stage("Upload Website") {
- if (env.BRANCH_NAME.startsWith('PR-')){
- bucket = 'gluon-nlp-staging'
- path = env.BRANCH_NAME+'/'+env.BUILD_NUMBER
- } else {
- bucket = 'gluon-nlp'
- path = env.BRANCH_NAME
- }
- sh """
- conda activate ./conda/cpu/py3-master
- make docs
- ci/upload_doc.sh ${bucket} ${path}
- """
- }
- }
- }
-
- utils.parallel_stage('Documentation', [
- build_steps.website_linkcheck('gluon-nlp-cpu-py3-master', 'cpu/py3-master')
- ])
-
- utils.parallel_stage('Deploy', [
- build_steps.post_website_link()
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_integration b/ci/jenkins/Jenkinsfile_py3-master_gpu_integration
deleted file mode 100644
index 31002e4bdd..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_integration
+++ /dev/null
@@ -1,53 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3-master', 'gpu/py3-master', 'scripts')
- ])
-
- utils.parallel_stage('Scripts', [
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master)) and integration',
- 4, true, true),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and integration and (not skip_master)',
- 0, true, true)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest b/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest
deleted file mode 100644
index 6275e40d58..0000000000
--- a/ci/jenkins/Jenkinsfile_py3-master_gpu_unittest
+++ /dev/null
@@ -1,61 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3-master', 'gpu/py3-master', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master))',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and serial and not skip_master',
- 0, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not (serial or skip_master or integration))',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3-master', 'gpu/py3-master',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and not (skip_master or integration)',
- 0, true, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_cpu_unittest b/ci/jenkins/Jenkinsfile_py3_cpu_unittest
deleted file mode 100644
index 6d518fdbfd..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_cpu_unittest
+++ /dev/null
@@ -1,69 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-cpu-py3', 'cpu/py3', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'not (gpu or serial)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- '(not gpu) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial or integration)',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- '(not (gpu or integration)) and serial',
- 0, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'not (gpu or serial) and integration',
- 4, false, false),
- build_steps.test_unittest('gluon-nlp-cpu-py3', 'cpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- '(not gpu) and serial and integration',
- 0, false, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_gpu_integration b/ci/jenkins/Jenkinsfile_py3_gpu_integration
deleted file mode 100644
index e683f5f14d..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_gpu_integration
+++ /dev/null
@@ -1,53 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3', 'gpu/py3', 'scripts')
- ])
-
- utils.parallel_stage('Scripts', [
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and (not serial) and integration',
- 4, true, true),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and integration',
- 0, true, true)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/Jenkinsfile_py3_gpu_unittest b/ci/jenkins/Jenkinsfile_py3_gpu_unittest
deleted file mode 100644
index 8430ca0a36..0000000000
--- a/ci/jenkins/Jenkinsfile_py3_gpu_unittest
+++ /dev/null
@@ -1,61 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// Jenkins pipeline
-// See documents at https://jenkins.io/doc/book/pipeline/jenkinsfile/
-
-// timeout in minutes
-max_time = 120
-
-node {
- // Loading the utilities requires a node context unfortunately
- checkout scm
- utils = load('ci/jenkins/utils.groovy')
- build_steps = load('ci/jenkins/build_steps.groovy')
-}
-utils.assign_node_labels(linux_gpu: 'linux-gpu', linux_cpu: 'linux-cpu')
-
-utils.main_wrapper(
-core_logic: {
- utils.parallel_stage('Sanity', [
- build_steps.sanity_lint('gluon-nlp-gpu-py3', 'gpu/py3', 'src/gluonnlp')
- ])
-
- utils.parallel_stage('Tests', [
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and not serial',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'tests/unittest', 'src/gluonnlp',
- 'gpu and serial',
- 0, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and not (serial or integration)',
- 4, true, false),
- build_steps.test_unittest('gluon-nlp-gpu-py3', 'gpu/py3',
- 'scripts/tests', 'src/gluonnlp',
- 'gpu and serial and not integration',
- 0, true, false)
- ])
-}
-,
-failure_handler: {}
-)
diff --git a/ci/jenkins/build_steps.groovy b/ci/jenkins/build_steps.groovy
deleted file mode 100644
index 63bd59e81d..0000000000
--- a/ci/jenkins/build_steps.groovy
+++ /dev/null
@@ -1,127 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-//
-// This file contains the steps that will be used in the
-// Jenkins pipelines
-
-utils = load('ci/jenkins/utils.groovy')
-
-def sanity_lint(workspace_name, conda_env_name, path) {
- return ['Lint': {
- node {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- make lintdir=${path} lint
- set +ex
- """
- }
- }
- }
- }]
-}
-
-def test_unittest(workspace_name, conda_env_name,
- test_path, cov_path,
- mark,
- threads, gpu, skip_report) {
- capture_flag = env.BRANCH_NAME.startsWith('PR-')?'':'--capture=no'
- node_type = gpu?NODE_LINUX_GPU:NODE_LINUX_CPU
- return ["${conda_env_name}: ${test_path} -m '${mark}'": {
- node(node_type) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- pytest -v ${capture_flag} -n ${threads} -m '${mark}' --durations=30 --cov ${cov_path} --cov-report=term --cov-report xml ${test_path}
- set +ex
- """
- if (!skip_report) utils.publish_test_coverage('GluonNLPCodeCov')
- }
- }
- }
- }]
-}
-
-def test_doctest(workspace_name, conda_env_name,
- test_path, cov_path, threads) {
- capture_flag = env.BRANCH_NAME.startsWith('PR-')?'':'--capture=no'
- return ["${conda_env_name}: doctest ${test_path}": {
- node(NODE_LINUX_CPU) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- pytest -v ${capture_flag} -n ${threads} --durations=30 --cov ${cov_path} --cov-report=term --cov-report xml --doctest-modules ${test_path}
- set +ex
- """
- utils.publish_test_coverage('GluonNLPCodeCov')
- }
- }
- }
- }]
-}
-
-def website_linkcheck(workspace_name, conda_env_name) {
- return ["${conda_env_name}: website link check": {
- node(NODE_LINUX_CPU) {
- ws(workspace_name) {
- timeout(time: max_time, unit: 'MINUTES') {
- utils.init_git()
- sh """
- set -ex
- source ci/prepare_clean_env.sh ${conda_env_name}
- make distribute
- set +ex
- """
- linkcheck_errors = sh returnStdout: true, script: """
- conda activate ./conda/${conda_env_name}
- """
- linkcheck_errors = linkcheck_errors.split('\n').findAll {it ==~ '/^(line *[0-9]*) broken.*$/'}
- linkcheck_errors = linkcheck_errors.join('\n')
- linkcheck_errors = linkcheck_errors.trim()
- if (linkcheck_errors && env.BRANCH_NAME.startsWith("PR-")) {
- pullRequest.comment("Found link check problems in job ${env.BRANCH_NAME}/${env.BUILD_NUMBER}:\n"+linkcheck_errors)
- }
- }
- }
- }
- }]
-}
-
-def post_website_link() {
- return ["Deploy: ": {
- node {
- timeout(time: max_time, unit: 'MINUTES') {
- if (env.BRANCH_NAME.startsWith("PR-")) {
- pullRequest.comment("Job ${env.BRANCH_NAME}/${env.BUILD_NUMBER} is complete. \nDocs are uploaded to http://gluon-nlp-staging.s3-accelerate.dualstack.amazonaws.com/${env.BRANCH_NAME}/${env.BUILD_NUMBER}/index.html")
- }
- }
- }
- }]
-}
-
-return this
diff --git a/ci/jenkins/utils.groovy b/ci/jenkins/utils.groovy
deleted file mode 100644
index ddbde419d5..0000000000
--- a/ci/jenkins/utils.groovy
+++ /dev/null
@@ -1,214 +0,0 @@
-// -*- mode: groovy -*-
-
-// Licensed to the Apache Software Foundation (ASF) under one
-// or more contributor license agreements. See the NOTICE file
-// distributed with this work for additional information
-// regarding copyright ownership. The ASF licenses this file
-// to you under the Apache License, Version 2.0 (the
-// "License"); you may not use this file except in compliance
-// with the License. You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing,
-// software distributed under the License is distributed on an
-// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-// KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations
-// under the License.
-
-// initialize source codes
-def init_git() {
- deleteDir()
- retry(5) {
- try {
-      // Make sure to wait long enough for the api.github.com request quota. Important: Don't increase the number of
-      // retries, as this will increase the number of requests and worsen the throttling
- timeout(time: 15, unit: 'MINUTES') {
- checkout scm
- sh 'git clean -xdff'
- sh 'git reset --hard'
- sh 'git submodule update --init --recursive'
- sh 'git submodule foreach --recursive git clean -ffxd'
- sh 'git submodule foreach --recursive git reset --hard'
- }
- } catch (exc) {
- deleteDir()
- error "Failed to fetch source codes with ${exc}"
- sleep 2
- }
- }
-}
-
-
-def get_git_commit_hash() {
- lastCommitMessage = sh (script: "git log -1 --pretty=%B", returnStdout: true)
- lastCommitMessage = lastCommitMessage.trim()
- if (lastCommitMessage.startsWith("Merge commit '") && lastCommitMessage.endsWith("' into HEAD")) {
- // Merge commit applied by Jenkins, skip that commit
- git_commit_hash = sh (script: "git rev-parse @~", returnStdout: true)
- } else {
- git_commit_hash = sh (script: "git rev-parse @", returnStdout: true)
- }
- return git_commit_hash.trim()
-}
-
-def publish_test_coverage(codecov_credential) {
-    // CodeCov's auto detection has trouble with our CI's PR validation due to the merging strategy
- git_commit_hash = get_git_commit_hash()
-
- if (env.CHANGE_ID) {
- // PR execution
- codecovArgs = "-B ${env.CHANGE_TARGET} -C ${git_commit_hash} -P ${env.CHANGE_ID}"
- } else {
- // Branch execution
- codecovArgs = "-B ${env.BRANCH_NAME} -C ${git_commit_hash}"
- }
-
- // To make sure we never fail because test coverage reporting is not available
- // Fall back to our own copy of the bash helper if it failed to download the public version
- withCredentials([string(credentialsId: codecov_credential, variable: 'CODECOV_TOKEN')]) {
- sh "(curl --retry 10 -s https://codecov.io/bash | bash -s - ${codecovArgs}) || (curl --retry 10 -s https://s3-us-west-2.amazonaws.com/mxnet-ci-prod-slave-data/codecov-bash.txt | bash -s - ${codecovArgs}) || true"
- }
-}
-
-// Allow publishing to GitHub with a custom context (the status shown under a PR)
-// Credit to https://plugins.jenkins.io/github
-def get_repo_url() {
- checkout scm
- return sh(returnStdout: true, script: "git config --get remote.origin.url").trim()
-}
-
-def update_github_commit_status(state, message) {
- node {
- // NOTE: https://issues.jenkins-ci.org/browse/JENKINS-39482
- //The GitHubCommitStatusSetter requires that the Git Server is defined under
- //*Manage Jenkins > Configure System > GitHub > GitHub Servers*.
- //Otherwise the GitHubCommitStatusSetter is not able to resolve the repository name
- //properly and you would see an empty list of repos:
- //[Set GitHub commit status (universal)] PENDING on repos [] (sha:xxxxxxx) with context:test/mycontext
- //See https://cwiki.apache.org/confluence/display/MXNET/Troubleshooting#Troubleshooting-GitHubcommit/PRstatusdoesnotgetpublished
-
- echo "Publishing commit status..."
-
- repoUrl = get_repo_url()
- echo "repoUrl=${repoUrl}"
-
- commitSha = get_git_commit_hash()
- echo "commitSha=${commitSha}"
-
- context = get_github_context()
- echo "context=${context}"
-
- // a few attempts need to be made: https://github.com/apache/incubator-mxnet/issues/11654
- for (int attempt = 1; attempt <= 3; attempt++) {
- echo "Sending GitHub status attempt ${attempt}..."
-
- step([
- $class: 'GitHubCommitStatusSetter',
- reposSource: [$class: "ManuallyEnteredRepositorySource", url: repoUrl],
- contextSource: [$class: "ManuallyEnteredCommitContextSource", context: context],
- commitShaSource: [$class: "ManuallyEnteredShaSource", sha: commitSha],
- statusBackrefSource: [$class: "ManuallyEnteredBackrefSource", backref: "${env.RUN_DISPLAY_URL}"],
- errorHandlers: [[$class: 'ShallowAnyErrorHandler']],
- statusResultSource: [
- $class: 'ConditionalStatusResultSource',
- results: [[$class: "AnyBuildResult", message: message, state: state]]
- ]
- ])
-
- if (attempt <= 2) {
- sleep 1
- }
- }
-
- echo "Publishing commit status done."
-
- }
-}
-
-def get_github_context() {
- // Since we use multi-branch pipelines, Jenkins appends the branch name to the job name
- if (env.BRANCH_NAME) {
- short_job_name = JOB_NAME.substring(0, JOB_NAME.lastIndexOf('/'))
- } else {
- short_job_name = JOB_NAME
- }
-
- return "ci/jenkins/${short_job_name}"
-}
-
-def parallel_stage(stage_name, steps) {
-    // Allow passing an array of steps that will be executed in parallel in a stage
- new_map = [:]
-
- for (def step in steps) {
- new_map = new_map << step
- }
-
- stage(stage_name) {
- parallel new_map
- }
-}
-
-def assign_node_labels(args) {
-    // This function allows assigning instance labels to the generalized placeholders.
- // This serves two purposes:
- // 1. Allow generalized placeholders (e.g. NODE_WINDOWS_CPU) in the job definition
-    // in order to abstract away the underlying node label. This allows scheduling a job
- // onto a different node for testing or security reasons. This could be, for example,
- // when you want to test a new set of slaves on separate labels or when a job should
- // only be run on restricted slaves
- // 2. Restrict the allowed job types within a Jenkinsfile. For example, a UNIX-CPU-only
-    // Jenkinsfile should not be allowed access to Windows or GPU instances. This prevents
- // users from just copy&pasting something into an existing Jenkinsfile without
- // knowing about the limitations.
- NODE_LINUX_GPU = args.linux_gpu
- NODE_LINUX_CPU = args.linux_cpu
-}
-
-def main_wrapper(args) {
-    // Main Jenkinsfile pipeline wrapper handler that allows wrapping core logic in a format
- // that supports proper failure handling
- // args:
- // - core_logic: Jenkins pipeline containing core execution logic
- // - failure_handler: Failure handler
-
- // assign any caught errors here
- err = null
- try {
- update_github_commit_status('PENDING', 'Job has been enqueued')
-
- timestamps {
- args['core_logic']()
- }
-
- // set build status to success at the end
- currentBuild.result = "SUCCESS"
- update_github_commit_status('SUCCESS', 'Job succeeded')
- } catch (caughtError) {
- node {
- sh "echo caught ${caughtError}"
- err = caughtError
- currentBuild.result = "FAILURE"
- update_github_commit_status('FAILURE', 'Job failed')
- }
- } finally {
- timestamps {
- node {
- // Call failure handler
- args['failure_handler']()
-
- // Clean workspace to reduce space requirements
- cleanWs()
-
- // Remember to rethrow so the build is marked as failing
- if (err) {
- throw err
- }
- }
- }
- }
-}
-
-return this
diff --git a/ci/prepare_clean_env.sh b/ci/prepare_clean_env.sh
deleted file mode 100755
index 1a224c418a..0000000000
--- a/ci/prepare_clean_env.sh
+++ /dev/null
@@ -1,25 +0,0 @@
-#!/bin/bash
-env_name=$1
-
-echo Preparing clean environment on $(hostname) in $(ls -id $(pwd))
-
-export LD_LIBRARY_PATH=/usr/local/cuda-10.0/lib64
-export CUDA_VISIBLE_DEVICES=$EXECUTOR_NUMBER
-export CONDA_ENVS_PATH=$PWD/conda
-export CONDA_PKGS_DIRS=$PWD/conda/pkgs
-export MXNET_HOME=$PWD/tests/data
-export HOROVOD_WITHOUT_TENSORFLOW=1
-export HOROVOD_WITHOUT_PYTORCH=1
-export HOROVOD_WITH_MXNET=1
-
-make clean
-conda env update --prune -p conda/${env_name} -f env/${env_name}.yml
-conda activate ./conda/${env_name}
-conda list
-printenv
-
-pip install -v -e .
-pip install horovod --no-cache-dir -U
-python -m spacy download en
-python -m spacy download de
-python -m nltk.downloader all
diff --git a/ci/rat/rat-excludes b/ci/rat/rat-excludes
deleted file mode 100755
index 3d6d00f7e8..0000000000
--- a/ci/rat/rat-excludes
+++ /dev/null
@@ -1,55 +0,0 @@
-\..*
-.*css
-\\.*
-.*ipynb
-.*html
-.*json
-.*txt
-3rdparty/*
-R-package/*
-trunk/*
-.*\\.m
-.*\\.mk
-.*\\.R
-.*svg
-.*cfg
-.*config
-.*rst
-__init__.py
-build/*
-.*\\.t
-MANIFEST
-Changes
-.*csv
-.*names
-CODEOWNERS
-snap.python
-bbox.pyx
-cpu_nms.pyx
-gpu_nms.pyx
-nms_kernel.cu
-_mask.pyx
-coco.py
-base.pyi
-special_functions-inl.h
-erfinv-inl.h
-im2col.cuh
-im2col.h
-pool.h
-dataset.cPickle
-image-classification/*
-rat-excludes
-apache-rat-tasks/*
-moderngpu/*
-deformable_im2col.cuh
-deformable_im2col.h
-REQUIRE
-Project.toml
-include/*
-.*.iml
-.*.json.ref
-searchtools_custom.js
-theme.conf
-LICENSE.binary.dependencies
-multi-bleu-detok.perl
-multi-bleu.perl
diff --git a/ci/upload_doc.sh b/ci/upload_doc.sh
deleted file mode 100755
index efa5e5d904..0000000000
--- a/ci/upload_doc.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-bucket=$1
-path=$2
-echo "Uploading doc to s3://${bucket}/${path}/"
-aws s3 sync --delete docs/_build/html/ s3://${bucket}/${path}/ --acl public-read
-echo "Uploaded doc to http://${bucket}.s3-accelerate.dualstack.amazonaws.com/${path}/index.html"
diff --git a/codecov.yml b/codecov.yml
deleted file mode 100644
index fcc1c6dece..0000000000
--- a/codecov.yml
+++ /dev/null
@@ -1,30 +0,0 @@
-codecov:
- notify:
- require_ci_to_pass: yes
- ci:
- - ci.mxnet.io
-
-coverage:
- precision: 2
- round: down
- range: "70...100"
-
- status:
- project: yes
- patch: yes
- changes: no
-
-parsers:
- gcov:
- branch_detection:
- conditional: yes
- loop: yes
- method: no
- macro: no
-
-comment:
- layout: "header, reach, diff, files"
- behavior: default
- require_changes: no
- require_base: no
- require_head: no
diff --git a/conftest.py b/conftest.py
index 8c9e442716..04efde9756 100644
--- a/conftest.py
+++ b/conftest.py
@@ -97,7 +97,7 @@ def pytest_configure():
'use MXNET_MODULE_SEED={} to reproduce.'.format(seed))
np.random.seed(seed)
- mx.random.seed(seed)
+ mx.npx.random.seed(seed)
random.seed(seed)
# The MXNET_TEST_SEED environment variable will override MXNET_MODULE_SEED for tests with
@@ -197,6 +197,7 @@ def test_not_ok_with_random_data():
def hybridize(request):
return request.param
+
@pytest.fixture(autouse=True)
def doctest(doctest_namespace):
doctest_namespace['np'] = np
@@ -205,3 +206,10 @@ def doctest(doctest_namespace):
doctest_namespace['gluon'] = mx.gluon
import doctest
doctest.ELLIPSIS_MARKER = '-etc-'
+
+def pytest_addoption(parser):
+ parser.addoption("--device", action="append", default=[], help="list of device choices to run the tests. ex: mx.gpu() (For GPU test only)")
+
+def pytest_generate_tests(metafunc):
+ if 'ctx' in metafunc.fixturenames:
+ metafunc.parametrize("ctx", [getattr(mx, device)() for device in metafunc.config.option.device])
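
For illustration (not part of the diff): a minimal sketch of a test that consumes the new ctx fixture parametrized above. The file name is hypothetical; when no --device option is given, the parameter set is empty and pytest simply skips such tests.

# hypothetical test module, e.g. tests/test_device_option.py
import mxnet as mx

def test_ones_on_requested_device(ctx):
    # ctx is supplied by pytest_generate_tests from the --device values,
    # e.g. `pytest --device cpu --device gpu tests/test_device_option.py`
    x = mx.np.ones((2, 3), ctx=ctx)
    assert x.shape == (2, 3)
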
diff --git a/docs/_static/custom.css b/docs/_static/custom.css
index f812baec3a..51f1f7df1c 100644
--- a/docs/_static/custom.css
+++ b/docs/_static/custom.css
@@ -20,9 +20,11 @@
}
@media (max-width: 650px) {
-.install .option, .install .title {
- width: 90%;
-}
-.install .title {
- margin-top: 1em;
+ .install .option, .install .title {
+ width: 90%;
+ }
+
+ .install .title {
+ margin-top: 1em;
+ }
}
diff --git a/docs/api/data.batchify.rst b/docs/api/data.batchify.rst
deleted file mode 100644
index 7a7eecd378..0000000000
--- a/docs/api/data.batchify.rst
+++ /dev/null
@@ -1,47 +0,0 @@
-gluonnlp.data.batchify
-======================
-
-Batchify functions can be used to transform a dataset into mini-batches that can be processed
-efficiently.
-
-.. currentmodule:: gluonnlp.data.batchify
-
-Batch Loaders
--------------
-
-.. autosummary::
- :nosignatures:
-
- Stack
- Pad
- List
- Tuple
- NamedTuple
- Dict
-
-
-Language Modeling
------------------
-
-.. autosummary::
- :nosignatures:
-
- CorpusBatchify
- CorpusBPTTBatchify
- StreamBPTTBatchify
-
-Embedding Training
-------------------
-
-.. autosummary::
- :nosignatures:
-
- EmbeddingCenterContextBatchify
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.data.batchify
- :members:
- :imported-members:
- :special-members: __call__, __iter__
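
For reference (not part of the diff), a small sketch of the legacy gluonnlp.data.batchify usage that this removed page documented; it is written from memory against the pre-refactor 0.x API, so treat the exact signatures as approximate.

import gluonnlp as nlp

batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(pad_val=0),  # pad the variable-length token id lists
    nlp.data.batchify.Stack())         # stack the scalar labels
samples = [([1, 2, 3], 0), ([4, 5], 1)]
token_ids, labels = batchify_fn(samples)
# token_ids has shape (2, 3); labels has shape (2,)
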
diff --git a/docs/api/data.rst b/docs/api/data.rst
index 78a13e9b79..540dc977f9 100644
--- a/docs/api/data.rst
+++ b/docs/api/data.rst
@@ -5,294 +5,11 @@ GluonNLP Toolkit provides tools for building efficient data pipelines for NLP ta
.. currentmodule:: gluonnlp.data
-Public Datasets
----------------
-
-Popular datasets for NLP tasks are provided in gluonnlp.
-By default, all built-in datasets are automatically downloaded from a public repo and
-reside in ~/.mxnet/datasets/.
-
-
-Language modeling
-~~~~~~~~~~~~~~~~~
-
-`WikiText `_
-is a popular language modeling dataset from Salesforce.
-It is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia.
-The dataset is available under the Creative Commons Attribution-ShareAlike License.
-
-`Google 1 Billion Words `_
-is a popular language modeling dataset.
-It is a collection of over 0.8 billion tokens extracted from the WMT11 website.
-The dataset is available under Apache License.
-
-.. autosummary::
- :nosignatures:
-
- WikiText2
- WikiText103
- WikiText2Raw
- WikiText103Raw
- GBWStream
-
-
-Text Classification
-~~~~~~~~~~~~~~~~~~~
-
-`IMDB `_ is a popular dataset for binary sentiment classification.
-It provides a set of 25,000 highly polar movie reviews for training, 25,000 for testing, and additional unlabeled data.
-
-`MR `_ is a movie-review data set of 10,662 sentences labeled with respect to their overall sentiment polarity (positive or negative).
-
-`SST-1 `_ is an extension of the MR data set. However, training/test splits are provided and labels are fine-grained (very positive, positive, neutral, negative, very negative). The training and test data sets have 237,107 and 2,210 sentences respectively.
-
-SST-2 is the same as SST-1 with neutral sentences removed and only binary sentiment polarity is considered: very positive is considered positive, and very negative is considered negative.
-
-`SUBJ `_ is a Subjectivity data set for sentiment analysis. Sentences labeled with respect to their subjectivity status (subjective or objective).
-
-`TREC `_ is a question classification data set of about 6,000 questions, each labeled with one of six question types (e.g., person, location, or numeric value).
-
-CR is customer reviews of various products (cameras, MP3s etc.). Sentences are labeled with respect to their overall sentiment polarities (positive or negative).
-
-`MPQA `_ is an opinion polarity detection subtask. Sentences are labeled with respect to their overall sentiment polarities (positive or negative).
-
-.. autosummary::
- :nosignatures:
-
- IMDB
- MR
- SST_1
- SST_2
- SUBJ
- TREC
- CR
- MPQA
-
-
-Word Embedding Evaluation Datasets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-There are a number of commonly used datasets for intrinsic evaluation for word embeddings.
-
-The similarity-based evaluation datasets include:
-
-.. autosummary::
- :nosignatures:
-
- WordSim353
- MEN
- RadinskyMTurk
- RareWords
- SimLex999
- SimVerb3500
- SemEval17Task2
- BakerVerb143
- YangPowersVerb130
-
-Analogy-based evaluation datasets include:
-
-.. autosummary::
- :nosignatures:
-
- GoogleAnalogyTestSet
- BiggerAnalogyTestSet
-
-
-CoNLL Datasets
-~~~~~~~~~~~~~~
-The `CoNLL `_ datasets are from a series of annual
-competitions held at the top tier conference of the same name. The conference is organized by SIGNLL.
-
-These datasets include data for the shared tasks, such as part-of-speech (POS) tagging, chunking,
-named entity recognition (NER), semantic role labeling (SRL), etc.
-
-We provide built-in support for CoNLL 2000 -- 2002, 2004, as well as the Universal Dependencies
-dataset which is used in the 2017 and 2018 competitions.
-
-.. autosummary::
- :nosignatures:
-
- CoNLL2000
- CoNLL2001
- CoNLL2002
- CoNLL2004
- UniversalDependencies21
-
-
-Machine Translation Datasets
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autosummary::
- :nosignatures:
-
- IWSLT2015
- WMT2014
- WMT2014BPE
- WMT2016
- WMT2016BPE
-
-
-Intent Classification and Slot Labeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autosummary::
- :nosignatures:
-
- ATISDataset
- SNIPSDataset
-
-
-Question Answering
-~~~~~~~~~~~~~~~~~~
-
-`Stanford Question Answering Dataset (SQuAD) `_ is a reading comprehension dataset, consisting of questions posed by crowdworkers on a set of Wikipedia articles, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.
-
-.. autosummary::
- :nosignatures:
-
- SQuAD
-
-
-GLUE Benchmark
-~~~~~~~~~~~~~~
-
-The `General Language Understanding Evaluation (GLUE) benchmark `_ is a collection of resources for training, evaluating, and analyzing natural language understanding systems.
-
-.. autosummary::
- :nosignatures:
-
- GlueCoLA
- GlueSST2
- GlueSTSB
- GlueQQP
- GlueRTE
- GlueMNLI
- GlueQNLI
- GlueWNLI
- GlueMRPC
-
-
-SuperGLUE Benchmark
-~~~~~~~~~~~~~~~~~~~~
-
-The `SuperGLUE Benchmark `_ is a benchmark styled after GLUE with a new set of more difficult language understanding tasks.
-
-.. autosummary::
- :nosignatures:
-
- SuperGlueRTE
- SuperGlueCB
- SuperGlueWSC
- SuperGlueWiC
- SuperGlueCOPA
- SuperGlueMultiRC
- SuperGlueBoolQ
- SuperGlueReCoRD
- SuperGlueAXb
- SuperGlueAXg
-
-
-Datasets
---------
-
-Dataset API for processing common text formats. The following classes can be used or subclassed to
-load custom datasets.
-
-.. autosummary::
- :nosignatures:
-
- TextLineDataset
- CorpusDataset
- TSVDataset
-
-
-DataStreams
------------
-
-DataStream API for streaming and processing common text formats. The following classes can be used or subclassed to
-stream large custom data.
-
-.. autosummary::
- :nosignatures:
-
- DataStream
- SimpleDataStream
- DatasetStream
- SimpleDatasetStream
- PrefetchingStream
-
-Transforms
-----------
-
-Text data transformation functions. They can be used for processing text sequences in conjunction
-with the `Dataset.transform` method.
-
-.. autosummary::
- :nosignatures:
-
- ClipSequence
- PadSequence
- SacreMosesTokenizer
- SpacyTokenizer
- SacreMosesDetokenizer
- BERTTokenizer
- BERTSentenceTransform
-
-Samplers
---------
-
-Samplers determine how to iterate through datasets. The below samplers and batch samplers can help
-iterate through sequence data.
-
-.. autosummary::
- :nosignatures:
-
- SortedSampler
- FixedBucketSampler
- SortedBucketSampler
- SplitSampler
-
-The `FixedBucketSampler` uses the following bucket scheme classes to generate bucket keys.
-
-.. autosummary::
- :nosignatures:
-
- ConstWidthBucket
- LinearWidthBucket
- ExpWidthBucket
-
-DataLoaders
------------
-
-DataLoaders load data from a dataset and return mini-batches of data.
-
-.. autosummary::
- :nosignatures:
-
- ShardedDataLoader
- DatasetLoader
-
-Utilities
----------
-
-Miscellaneous utility classes and functions for processing text and sequence data.
-
-.. autosummary::
- :nosignatures:
-
- Counter
- count_tokens
- concat_sequence
- slice_sequence
- train_valid_split
- register
- create
- list_datasets
API Reference
-------------
.. automodule:: gluonnlp.data
- :members:
- :imported-members:
- :special-members: __iter__, __call__
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
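
As background (not part of the diff), the datasets, samplers and utilities listed above belong to the legacy 0.x API whose documentation is removed here; a rough usage sketch from memory, with names and defaults treated as approximate:

import gluonnlp as nlp

counter = nlp.data.count_tokens(['as', 'green', 'as', 'grass'])  # a collections.Counter
sampler = nlp.data.FixedBucketSampler(lengths=[10, 24, 7, 100], batch_size=2, num_buckets=2)
for batch_indices in sampler:
    pass  # each batch groups samples of similar length to limit padding
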
diff --git a/docs/api/embedding.rst b/docs/api/embedding.rst
index b31d4db3f5..9f5830b16e 100644
--- a/docs/api/embedding.rst
+++ b/docs/api/embedding.rst
@@ -7,8 +7,7 @@ GluonNLP Toolkit provides tools for working with embeddings.
This page describes the ``gluonnlp`` APIs for text embedding, such as loading
pre-trained embedding vectors for text tokens and storing them in the
-``mxnet.ndarray.NDArray`` format as well as utilities for intrinsic evaluation
-of text embeddings.
+``numpy.ndarray`` format.
Pre-trained Embeddings
@@ -18,32 +17,9 @@ Pre-trained Embeddings
.. autosummary::
:nosignatures:
- register
- create
list_sources
- TokenEmbedding
- GloVe
- FastText
- Word2Vec
-
-
-Intrinsic evaluation
---------------------
-
-.. currentmodule:: gluonnlp.embedding.evaluation
-.. autosummary::
- :nosignatures:
-
- register
- create
- list_evaluation_functions
- WordEmbeddingSimilarityFunction
- WordEmbeddingAnalogyFunction
- CosineSimilarity
- ThreeCosAdd
- ThreeCosMul
- WordEmbeddingSimilarity
- WordEmbeddingAnalogy
+ load_embeddings
+ get_fasttext_model
API Reference
@@ -54,7 +30,4 @@ API Reference
:imported-members:
:special-members: __contains__, __getitem__, __setitem__
-.. automodule:: gluonnlp.embedding.evaluation
- :members:
- :imported-members:
- :special-members: __contains__, __getitem__, __setitem__
+
diff --git a/docs/api/index.rst b/docs/api/index.rst
index 4d9a7e76ae..5cc27a6e00 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -4,13 +4,7 @@ API Documentation
.. toctree::
:maxdepth: 2
- vocab
- embedding
data
- data.batchify
- model
- model.train
- loss
- initializer
- optimizer
+ embedding
+ models
utils
diff --git a/docs/api/initializer.rst b/docs/api/initializer.rst
deleted file mode 100644
index 5c104e7244..0000000000
--- a/docs/api/initializer.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-gluonnlp.initializer
-======================
-
-This page describes initializers that are useful for multiple NLP model architectures.
-
-.. currentmodule:: gluonnlp.initializer
-
-Highway Bias Initializer
---------------------------
-
-We now provide the Highway bias initializer defined in the following work.
-
-.. code-block:: none
-
- @inproceedings{srivastava2015training,
- title={Training very deep networks},
- author={Srivastava, Rupesh K and Greff, Klaus and Schmidhuber, J{\"u}rgen},
- booktitle={Advances in neural information processing systems},
- pages={2377--2385},
- year={2015}}
-
-.. autosummary::
- :nosignatures:
-
- HighwayBias
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.initializer
- :members:
- :imported-members:
diff --git a/docs/api/loss.rst b/docs/api/loss.rst
deleted file mode 100644
index 12acfc645c..0000000000
--- a/docs/api/loss.rst
+++ /dev/null
@@ -1,51 +0,0 @@
-gluonnlp.loss
-=============
-
-GluonNLP Toolkit provides tools for easily setting up task-specific losses.
-
-.. currentmodule:: gluonnlp.loss
-
-Masked Loss
------------
-
-.. autosummary::
- :nosignatures:
-
- MaskedSoftmaxCrossEntropyLoss
-
-
-Label Smoothing
----------------
-
-.. autosummary::
- :nosignatures:
-
- LabelSmoothing
-
-
-Activation Regularizers
------------------------
-
-Activation regularization and temporal activation regularization defined in the following work:
-
-.. code-block:: none
-
- @article{merity2017revisiting,
- title={Revisiting Activation Regularization for Language RNNs},
- author={Merity, Stephen and McCann, Bryan and Socher, Richard},
- journal={arXiv preprint arXiv:1708.01009},
- year={2017}}
-
-.. autosummary::
- :nosignatures:
-
- ActivationRegularizationLoss
- TemporalActivationRegularizationLoss
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.loss
- :members:
- :imported-members:
diff --git a/docs/api/model.rst b/docs/api/model.rst
deleted file mode 100644
index 8cc594bb87..0000000000
--- a/docs/api/model.rst
+++ /dev/null
@@ -1,170 +0,0 @@
-gluonnlp.model
-==============
-
-GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
-all requested pre-trained weights are downloaded from a public repo and stored in ~/.mxnet/models/.
-
-.. currentmodule:: gluonnlp.model
-
-Model Registry
---------------
-
-The model registry provides an easy interface to obtain pre-defined and pre-trained models.
-
-.. autosummary::
- :nosignatures:
-
- get_model
-
-The `get_model` function returns a pre-defined model given the name of a
-registered model. The following sections of this page present a list of
-registered names for each model category.
-
-Information about pretrained models
------------------------------------
-
-.. autosummary::
- :nosignatures:
-
- list_models
-
-Language Modeling
------------------
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- AWDRNN
- BiLMEncoder
- LSTMPCellWithClip
- StandardRNN
- BigRNN
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- awd_lstm_lm_1150
- awd_lstm_lm_600
- standard_lstm_lm_200
- standard_lstm_lm_650
- standard_lstm_lm_1500
- big_rnn_lm_2048_512
-
-Machine Translation
--------------------
-
-.. autosummary::
- :nosignatures:
-
- Seq2SeqEncoder
- TransformerEncoder
- TransformerEncoderCell
- PositionwiseFFN
-
-.. autosummary::
- :nosignatures:
-
- transformer_en_de_512
-
-Bidirectional Encoder Representations from Transformers
--------------------------------------------------------
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- BERTModel
- BERTEncoder
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- bert_12_768_12
- bert_24_1024_16
-
-Convolutional Encoder
----------------------
-
-.. autosummary::
- :nosignatures:
-
- ConvolutionalEncoder
-
-ELMo
-----
-
-Components
-
-.. autosummary::
- :nosignatures:
-
- ELMoBiLM
- ELMoCharacterEncoder
-
-Pre-defined models
-
-.. autosummary::
- :nosignatures:
-
- elmo_2x1024_128_2048cnn_1xhighway
- elmo_2x2048_256_2048cnn_1xhighway
- elmo_2x4096_512_2048cnn_2xhighway
-
-Highway Network
------------------
-
-.. autosummary::
- :nosignatures:
-
- Highway
-
-Attention Cell
---------------
-
-.. autosummary::
- :nosignatures:
-
- AttentionCell
- MultiHeadAttentionCell
- MLPAttentionCell
- DotProductAttentionCell
-
-Sequence Sampling
------------------
-
-.. autosummary::
- :nosignatures:
-
- BeamSearchScorer
- BeamSearchSampler
- SequenceSampler
-
-
-Other Modeling Utilities
-------------------------
-
-.. autosummary::
- :nosignatures:
-
- WeightDropParameter
- apply_weight_drop
- L2Normalization
- GELU
- ISDense
- NCEDense
- SparseISDense
- SparseNCEDense
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.model
- :members:
- :imported-members:
diff --git a/docs/api/model.train.rst b/docs/api/model.train.rst
deleted file mode 100644
index 500ab60c72..0000000000
--- a/docs/api/model.train.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-gluonnlp.model.train
-=====================
-
-GluonNLP Toolkit supplies train-mode versions of models, since these models behave differently during training
-and inference, e.g., the number and type of the outputs from the forward pass are different.
-
-.. currentmodule:: gluonnlp.model.train
-
-Language Modeling
------------------
-
-.. autosummary::
- :nosignatures:
-
- AWDRNN
- StandardRNN
- CacheCell
- get_cache_model
- BigRNN
-
-
-
-Word Embeddings
----------------
-
-.. autosummary::
- :nosignatures:
-
- EmbeddingModel
- CSREmbeddingModel
- FasttextEmbeddingModel
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.model.train
- :members:
- :imported-members:
diff --git a/docs/api/models.rst b/docs/api/models.rst
new file mode 100644
index 0000000000..a2623ce4b9
--- /dev/null
+++ b/docs/api/models.rst
@@ -0,0 +1,15 @@
+gluonnlp.models
+===============
+
+GluonNLP Toolkit supplies models for common NLP tasks with pre-trained weights. By default,
+all requested pre-trained weights are downloaded from a public repo and stored in ~/.mxnet/models/.
+
+.. currentmodule:: gluonnlp.models
+
+API Reference
+-------------
+
+.. automodule:: gluonnlp.models
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
diff --git a/docs/api/optimizer.rst b/docs/api/optimizer.rst
deleted file mode 100644
index 8bf3f7e214..0000000000
--- a/docs/api/optimizer.rst
+++ /dev/null
@@ -1,23 +0,0 @@
-gluonnlp.optimizer
-======================
-
-GluonNLP provides some special optimizers for training in natural language processing.
-
-.. currentmodule:: gluonnlp.optimizer
-
-BERTAdam Optimizer
---------------------------
-
-The Adam optimizer with weight decay regularization for BERT.
-
-.. autosummary::
- :nosignatures:
-
- BERTAdam
-
-API Reference
--------------
-
-.. automodule:: gluonnlp.optimizer
- :members:
- :imported-members:
diff --git a/docs/api/utils.rst b/docs/api/utils.rst
index 58c1aa008f..e672814d8d 100644
--- a/docs/api/utils.rst
+++ b/docs/api/utils.rst
@@ -5,49 +5,10 @@ GluonNLP Toolkit provides tools for easily setting up task specific loss.
.. currentmodule:: gluonnlp.utils
-
-File Handling
--------------
-
-.. autosummary::
- :nosignatures:
-
- glob
- mkdir
-
-
-Parameter and Training
-----------------------
-
-.. autosummary::
- :nosignatures:
-
- clip_grad_global_norm
-
-
-Serialization and Deserialization
----------------------------------
-
-.. autosummary::
- :nosignatures:
-
- load_parameters
- load_states
- save_parameters
- save_states
-
-Setting Seed
----------------------------------
-
-.. autosummary::
- :nosignatures:
-
- set_seed
-
-
API Reference
-------------
.. automodule:: gluonnlp.utils
- :members:
- :imported-members:
+ :members:
+ :imported-members:
+ :special-members: __contains__, __getitem__, __setitem__
diff --git a/docs/api/vocab.rst b/docs/api/vocab.rst
deleted file mode 100644
index 15efa47367..0000000000
--- a/docs/api/vocab.rst
+++ /dev/null
@@ -1,78 +0,0 @@
-gluonnlp.vocab
-==============
-
-This page describes the ``gluonnlp.Vocab`` class for text data numericalization
-and the subword functionality provided in ``gluonnlp.vocab``.
-
-
-Vocabulary
-----------
-
-The vocabulary builds indices for text tokens, and token embeddings can be
-attached to it. The input counter, whose keys are the candidate tokens, may
-be obtained via :func:`gluonnlp.data.count_tokens`.
-
-.. currentmodule:: gluonnlp
-.. autosummary::
- :nosignatures:
-
- Vocab
-
-
-Subword functionality
----------------------
-
-When using a vocabulary of fixed size, out of vocabulary words may be
-encountered. However, words are composed of characters, allowing intelligent
-fallbacks for out of vocabulary words based on subword units such as the
-characters or ngrams in a word. :class:`gluonnlp.vocab.SubwordFunction` provides
-an API to map words to their subword units. :doc:`model.train` contains
-models that make use of subword information to compute word embeddings.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- SubwordFunction
- ByteSubwords
- NGramHashes
-
-
-ELMo Character-level Vocabulary
--------------------------------
-
-In the original ELMo pre-trained models, the character-level vocabulary relies on UTF-8 encoding in a specific setting.
-We provide the following vocabulary class to remain consistent with the ELMo pre-trained models.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- ELMoCharVocab
-
-
-BERT Vocabulary
-----------------
-
-The vocabulary for BERT, inherited from :class:`gluonnlp.Vocab`, provides some additional special tokens for ease of use.
-
-.. currentmodule:: gluonnlp.vocab
-.. autosummary::
- :nosignatures:
-
- BERTVocab
-
-
-API Reference
--------------
-
-.. automodule:: gluonnlp
- :members:
- :imported-members:
- :special-members: __call__, __len__
-
-.. automodule:: gluonnlp.vocab
- :exclude-members: Vocab
- :members:
- :imported-members:
- :special-members: __call__, __len__
diff --git a/docs/conf.py b/docs/conf.py
index c48d913674..f707e1277f 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -63,6 +63,7 @@
nbsphinx_kernel_name = 'python3'
nbsphinx_allow_errors = True
nbsphinx_timeout = 1200
+nbsphinx_execute = 'never'
html_sourcelink_suffix = ''
html_context = {
@@ -172,8 +173,8 @@
'header_links' : [
('Install', 'install/install-more', False, ''),
('API', 'api/index', False, ''),
- ('Community', 'community/index', False, ''),
- ('Contribute', 'community/contribute', False, ''),
+ ('Community', 'website/index', False, ''),
+ ('Contribute', 'website/contribute', False, ''),
('GitHub', 'https://github.com/dmlc/gluon-nlp/', True, 'fab fa-github'),
],
@@ -209,7 +210,7 @@
intersphinx_mapping = {
'python': ('https://docs.python.org/{.major}'.format(sys.version_info), None),
- 'mxnet': ('https://beta.mxnet.io/', None),
+ 'mxnet': ('https://mxnet.apache.org/api/python/docs/', None),
'numpy': ('http://docs.scipy.org/doc/numpy/', None),
'scipy': ('http://docs.scipy.org/doc/scipy/reference', None),
'matplotlib': ('http://matplotlib.org/', None),
diff --git a/docs/examples/index.rst b/docs/examples/index.rst
index 0aba428b61..74178c748f 100644
--- a/docs/examples/index.rst
+++ b/docs/examples/index.rst
@@ -3,32 +3,8 @@ Tutorials
Interested in getting started in a new NLP area? Here are some tutorials to help get started.
-Data Loading and Vocabularies
------------------------------
-.. container:: cards
-
- .. card::
- :title: Data Loading APIs
- :link: notes/data_api.html
-
- Basics on how to load and process the sentiment dataset to form batches that can be processed efficiently.
-
- .. card::
- :title: Vocabulary APIs
- :link: notes/vocab_emb.html
-
- Learn how to write simple code to create indices for tokens.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- notes/index
-
-
-Representation Learning
+Embedding
-----------------------
.. container:: cards
@@ -40,26 +16,6 @@ Representation Learning
Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
analogy problems.
- .. card::
- :title: Word Embeddings Training and Evaluation
- :link: word_embedding/word_embedding_training.html
-
- Learn how to train fastText and word2vec embeddings on your own dataset, and determine
- embedding quality through intrinsic evaluation.
-
- .. card::
- :title: Extracting Sentence Features with Pre-trained ELMo
- :link: sentence_embedding/elmo_sentence_representation.html
-
- See how to use GluonNLP's API to automatically download the pre-trained ELMo model, and extract features from it.
-
- .. card::
- :title: Fine-tuning Pre-trained BERT Models
- :link: sentence_embedding/bert.html
-
- See how to use GluonNLP to fine-tune a sentence pair classification model with
- pre-trained BERT parameters.
-
.. toctree::
:hidden:
@@ -68,91 +24,3 @@ Representation Learning
word_embedding/index
-Language Modeling
------------------
-
-.. container:: cards
-
- .. card::
- :title: LSTM-based Language Models
- :link: language_model/use_pretrained_lm.html
-
- Learn what a language model is, what it can do, and how to train a word-level language model
- with truncated back-propagation-through-time (BPTT).
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- language_model/index
-
-
-Machine Translation
--------------------
-
-.. container:: cards
-
- .. card::
- :title: Training GNMT on IWSLT 2015 Dataset
- :link: machine_translation/gnmt.html
-
- Learn how to train Google Neural Machine Translation, a sequence-to-sequence model with attention.
-
- .. card::
- :title: Using Pre-trained Transformer
- :link: machine_translation/transformer.html
-
- Learn how to use a pre-trained transformer translation model for English-German translation.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- machine_translation/index
-
-
-Sentiment Analysis
-------------------
-
-.. container:: cards
-
- .. card::
- :title: Fine-tuning LSTM-based Language Model
- :link: sentiment_analysis/sentiment_analysis.html
-
- See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
-
- .. card::
- :title: Training Structured Self-attentive Sentence Embedding
- :link: sentiment_analysis/self_attentive_sentence_embedding.html
-
- See how to use GluonNLP to build a more advanced model structure for extracting sentence
- embeddings to predict Yelp review ratings.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sentiment_analysis/index
-
-
-Text Generation
----------------
-
-.. container:: cards
-
- .. card::
- :title: Sequence Generation with Beam Search Sampler and Sequence Sampler
- :link: sequence_sampling/sequence_sampling.html
-
- Learn how to generate sentences from a pre-trained language model through sampling and beam
- search.
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sequence_sampling/index
diff --git a/docs/examples/language_model/cache_model.png b/docs/examples/language_model/cache_model.png
deleted file mode 100644
index b3c06026d8..0000000000
Binary files a/docs/examples/language_model/cache_model.png and /dev/null differ
diff --git a/docs/examples/language_model/index.rst b/docs/examples/language_model/index.rst
deleted file mode 100644
index 9696673062..0000000000
--- a/docs/examples/language_model/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Language Modeling
-=================
-
-.. container:: cards
-
- .. card::
- :title: Using Pre-trained Language Model
- :link: use_pretrained_lm.html
-
- Learn what a language model is, what it can do, and how to use a pre-trained language model.
-
- .. card::
- :title: Train your own LSTM based Language Model
- :link: train_language_model.html
-
- Learn how to train a word-level language model
- with truncated back-propagation-through-time (BPTT).
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- use_pretrained_lm.ipynb
- train_language_model.ipynb
-
-
diff --git a/docs/examples/language_model/language_model_intro.png b/docs/examples/language_model/language_model_intro.png
deleted file mode 100644
index ec9af278ff..0000000000
Binary files a/docs/examples/language_model/language_model_intro.png and /dev/null differ
diff --git a/docs/examples/language_model/train_language_model.md b/docs/examples/language_model/train_language_model.md
deleted file mode 100644
index 4bd6178dd3..0000000000
--- a/docs/examples/language_model/train_language_model.md
+++ /dev/null
@@ -1,292 +0,0 @@
-# Train your own LSTM based Language Model
-
-Now let's go through the step-by-step process on how to train your own
-language model using GluonNLP.
-
-## Preparation
-
-We'll start by taking care of
-our basic dependencies and setting up our environment.
-
-Firstly, we import the required modules for GluonNLP and the LM.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import glob
-import time
-import math
-
-import mxnet as mx
-from mxnet import gluon, autograd
-from mxnet.gluon.utils import download
-
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-Then we set up the environment for GluonNLP.
-
-Please note that in the following code, `num_gpus` should be set according to how many NVIDIA GPUs are available on the target machine.
-
-```{.python .input}
-num_gpus = 1
-context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus else [mx.cpu()]
-log_interval = 200
-```
-
-Next, we set up the hyperparameters for the LM we are using.
-
-Note that BPTT stands for "backpropagation through time," and LR stands for learning rate. More information on truncated BPTT can be found [here](https://en.wikipedia.org/wiki/Backpropagation_through_time).
-
-```{.python .input}
-batch_size = 20 * len(context)
-lr = 20
-epochs = 3
-bptt = 35
-grad_clip = 0.25
-```
-
-## Loading the dataset
-
-Now, we load the dataset, extract the vocabulary, numericalize, and batchify in order to perform truncated BPTT.
-
-```{.python .input}
-dataset_name = 'wikitext-2'
-
-# Load the dataset
-train_dataset, val_dataset, test_dataset = [
- nlp.data.WikiText2(
- segment=segment, bos=None, eos='<eos>', skip_empty=False)
- for segment in ['train', 'val', 'test']
-]
-
-# Extract the vocabulary and numericalize with "Counter"
-vocab = nlp.Vocab(
- nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-
-# Batchify for BPTT
-bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(
- vocab, bptt, batch_size, last_batch='discard')
-train_data, val_data, test_data = [
- bptt_batchify(x) for x in [train_dataset, val_dataset, test_dataset]
-]
-```
-
-And then we load the pre-defined language model architecture like so:
-
-```{.python .input}
-model_name = 'standard_lstm_lm_200'
-model, vocab = nlp.model.get_model(model_name, vocab=vocab, dataset_name=None)
-print(model)
-print(vocab)
-
-# Initialize the model
-model.initialize(mx.init.Xavier(), ctx=context)
-
-# Initialize the trainer and optimizer and specify some hyperparameters
-trainer = gluon.Trainer(model.collect_params(), 'sgd', {
- 'learning_rate': lr,
- 'momentum': 0,
- 'wd': 0
-})
-
-# Specify the loss function, in this case, cross-entropy with softmax.
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-```
-
-## Training the LM
-
-Now that everything is ready, we can start training the model.
-
-We first define a helper function for detaching the gradients on specific states for easier truncated BPTT.
-
-```{.python .input}
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(i) for i in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-```
-
-And then a helper evaluation function.
-
-```{.python .input}
-# Note that ctx is short for context
-def evaluate(model, data_source, batch_size, ctx):
- total_L = 0.0
- ntotal = 0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- for i, (data, target) in enumerate(data_source):
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1), target.reshape(-1))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- return total_L / ntotal
-```
-
-### The main training loop
-
-Our loss function will be the standard cross-entropy loss function used for multi-class classification, applied at each time step to compare the model's predictions to the true next word in the sequence.
-We can calculate gradients with respect to our parameters using truncated BPTT.
-In this case, we'll back propagate for $35$ time steps, updating our weights with stochastic gradient descent and a learning rate of $20$; these correspond to the hyperparameters that we specified earlier in the notebook.
-
-
-
-```{.python .input}
-# Function for actually training the model
-def train(model, train_data, val_data, test_data, epochs, lr):
- best_val = float("Inf")
- start_train_time = time.time()
- parameters = model.collect_params().values()
-
- for epoch in range(epochs):
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(batch_size//len(context), func=mx.nd.zeros, ctx=ctx)
- for ctx in context]
-
- for i, (data, target) in enumerate(train_data):
- data_list = gluon.utils.split_and_load(data, context,
- batch_axis=1, even_split=True)
- target_list = gluon.utils.split_and_load(target, context,
- batch_axis=1, even_split=True)
- hiddens = detach(hiddens)
- L = 0
- Ls = []
-
- with autograd.record():
- for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
- output, h = model(X, h)
- batch_L = loss(output.reshape(-3, -1), y.reshape(-1,))
- L = L + batch_L.as_in_context(context[0]) / (len(context) * X.size)
- Ls.append(batch_L / (len(context) * X.size))
- hiddens[j] = h
- L.backward()
- grads = [p.grad(x.context) for p in parameters for x in data_list]
- gluon.utils.clip_global_norm(grads, grad_clip)
-
- trainer.step(1)
-
- total_L += sum([mx.nd.sum(l).asscalar() for l in Ls])
-
- if i % log_interval == 0 and i > 0:
- cur_L = total_L / log_interval
- print('[Epoch %d Batch %d/%d] loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s'%(
- epoch, i, len(train_data), cur_L, math.exp(cur_L),
- batch_size * log_interval / (time.time() - start_log_interval_time)))
- total_L = 0.0
- start_log_interval_time = time.time()
-
- mx.nd.waitall()
-
- print('[Epoch %d] throughput %.2f samples/s'%(
- epoch, len(train_data)*batch_size / (time.time() - start_epoch_time)))
-
- val_L = evaluate(model, val_data, batch_size, context[0])
- print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f'%(
- epoch, time.time()-start_epoch_time, val_L, math.exp(val_L)))
-
- if val_L < best_val:
- best_val = val_L
- test_L = evaluate(model, test_data, batch_size, context[0])
- model.save_parameters('{}_{}-{}.params'.format(model_name, dataset_name, epoch))
- print('test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
- else:
- lr = lr*0.25
- print('Learning rate now %f'%(lr))
- trainer.set_learning_rate(lr)
-
- print('Total training throughput %.2f samples/s'%(
- (batch_size * len(train_data) * epochs) /
- (time.time() - start_train_time)))
-```
-
-We can now actually perform the training:
-
-```{.python .input}
-train(model, train_data, val_data, test_data, epochs, lr)
-```
-
-## Using your own dataset
-
-When we train a language model, we fit to the statistics of a given dataset.
-While many papers focus on a few standard datasets, such as WikiText or the Penn Tree Bank, that's just to provide a standard benchmark for the purpose of comparing models against one another.
-In general, for any given use case, you'll want to train your own language model using a dataset of your own choice.
-Here, for demonstration, we'll grab some `.txt` files corresponding to Sherlock Holmes novels.
-
-We first download the new dataset.
-
-```{.python .input}
-TRAIN_PATH = "./sherlockholmes.train.txt"
-VALID_PATH = "./sherlockholmes.valid.txt"
-TEST_PATH = "./sherlockholmes.test.txt"
-PREDICT_PATH = "./tinyshakespeare/input.txt"
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.train.txt",
- TRAIN_PATH,
- sha1_hash="d65a52baaf32df613d4942e0254c81cff37da5e8")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.valid.txt",
- VALID_PATH,
- sha1_hash="71133db736a0ff6d5f024bb64b4a0672b31fc6b3")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/sherlockholmes/sherlockholmes.test.txt",
- TEST_PATH,
- sha1_hash="b7ccc4778fd3296c515a3c21ed79e9c2ee249f70")
-download(
- "https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/tinyshakespeare/input.txt",
- PREDICT_PATH,
- sha1_hash="04486597058d11dcc2c556b1d0433891eb639d2e")
-
-print(glob.glob("sherlockholmes.*.txt"))
-```
-
-Then we specify the tokenizer and batchify the dataset.
-
-```{.python .input}
-import nltk
-moses_tokenizer = nlp.data.SacreMosesTokenizer()
-
-sherlockholmes_datasets = [
- nlp.data.CorpusDataset(
- 'sherlockholmes.{}.txt'.format(name),
- sample_splitter=nltk.tokenize.sent_tokenize,
- tokenizer=moses_tokenizer,
- flatten=True,
- eos='<eos>') for name in ['train', 'valid', 'test']
-]
-
-sherlockholmes_train_data, sherlockholmes_val_data, sherlockholmes_test_data = [
- bptt_batchify(dataset) for dataset in sherlockholmes_datasets
-]
-```
-
-We set up the evaluation to see whether the model we trained on the previous dataset does well on the new one.
-
-```{.python .input}
-sherlockholmes_L = evaluate(model, sherlockholmes_val_data, batch_size,
- context[0])
-print('Best validation loss %.2f, test ppl %.2f' %
- (sherlockholmes_L, math.exp(sherlockholmes_L)))
-```
-
-Or we have the option of training the model on the new dataset with just one line of code.
-
-```{.python .input}
-train(
- model,
- sherlockholmes_train_data, # This is your input training data, we leave batchifying and tokenizing as an exercise for the reader
- sherlockholmes_val_data,
- sherlockholmes_test_data, # This would be your test data, again left as an exercise for the reader
- epochs=3,
- lr=20)
-```
diff --git a/docs/examples/language_model/use_pretrained_lm.md b/docs/examples/language_model/use_pretrained_lm.md
deleted file mode 100644
index c6351c872c..0000000000
--- a/docs/examples/language_model/use_pretrained_lm.md
+++ /dev/null
@@ -1,217 +0,0 @@
-# Using Pre-trained Language Model
-
-A statistical language model is simply a probability distribution over sequences of words or characters [1].
-In this tutorial, we'll restrict our attention to word-based language models.
-Given a reliable language model, we can answer questions like *which among the following strings are we more likely to encounter?*
-
-1. "On Monday, Mr. Lamar’s “DAMN.” took home an even more elusive honor,
-one that may never have even seemed within reach: the Pulitzer Prize."
-1. "Frog zealot flagged xylophone the bean wallaby anaphylaxis extraneous
-porpoise into deleterious carrot banana apricot."
-
-Even if we've never seen either of these sentences in our entire lives, and even though no rapper has previously been
-awarded a Pulitzer Prize, we wouldn't be shocked to see the first sentence in the New York Times.
-By comparison, we can all agree that the second sentence, consisting of incoherent babble, is comparatively unlikely.
-A statistical language model can assign precise probabilities to each of these and other strings of words.
-
-Given a large corpus of text, we can estimate (or, in this case, train) a language model $\hat{p}(x_1, ..., x_n)$.
-And given such a model, we can sample strings $\mathbf{x} \sim \hat{p}(x_1, ..., x_n)$, generating new strings according to their estimated probability.
-Among other useful applications, we can use language models to score candidate transcriptions from speech recognition models, giving preference to sentences that seem more probable (at the expense of those deemed anomalous).
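-
-To make this concrete, here is a toy sketch (added purely for illustration; the bigram probabilities below are made up and the smoothing is deliberately crude) that scores word sequences with the chain rule, $\log \hat{p}(x_1, ..., x_n) = \sum_t \log \hat{p}(x_t \mid x_{t-1})$, under a hypothetical bigram model:
-
-```{.python .input}
-import math
-
-# A made-up bigram model: p(word | previous word). A real model would
-# estimate these probabilities from a large corpus.
-bigram_probs = {
-    ('<bos>', 'the'): 0.4, ('the', 'cat'): 0.3, ('cat', 'sat'): 0.25,
-    ('<bos>', 'frog'): 0.001, ('frog', 'zealot'): 0.0001,
-}
-unseen_prob = 1e-6  # crude smoothing for bigrams the model has never seen
-
-def sentence_log_prob(tokens):
-    """Chain rule: log p(x_1..x_n) = sum_t log p(x_t | x_{t-1})."""
-    log_p = 0.0
-    for prev, cur in zip(['<bos>'] + tokens[:-1], tokens):
-        log_p += math.log(bigram_probs.get((prev, cur), unseen_prob))
-    return log_p
-
-print(sentence_log_prob(['the', 'cat', 'sat']))      # plausible sequence, higher log-probability
-print(sentence_log_prob(['frog', 'zealot', 'sat']))  # incoherent babble, much lower log-probability
-```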
-
-These days recurrent neural networks (RNNs) are the preferred method for language models. In this notebook, we will go through an example of using GluonNLP to
-
-(i) implement a typical LSTM language model architecture
-(ii) train the language model on a corpus of real data
-(iii) bring in your own dataset for training
-(iv) grab off-the-shelf pre-trained state-of-the-art language models (e.g., the AWD language model) using GluonNLP.
-
-## What is a language model (LM)?
-
-The standard approach to language modeling consists of training a model that, given a trailing window of text, predicts the next word in the sequence.
-When we train the model we feed in the inputs $x_1, x_2, ...$ and try at each time step to predict the corresponding next word $x_2, ..., x_{n+1}$.
-To generate text from a language model, we can iteratively predict the next word, and then feed this word as an input to the model at the subsequent time step. The image included below demonstrates this idea.
-
-![Language model](language_model_intro.png)
-
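-In code, this shifted-target setup looks like the following toy sketch (illustrative only; the token sequence is made up):
-
-```{.python .input}
-tokens = ['<bos>', 'the', 'cat', 'sat', 'on', 'the', 'mat', '<eos>']
-
-# At each time step the model reads x_1, ..., x_t and is trained to predict x_{t+1},
-# so the targets are simply the inputs shifted by one position.
-inputs = tokens[:-1]   # x_1, ..., x_n
-targets = tokens[1:]   # x_2, ..., x_{n+1}
-
-for x, y in zip(inputs, targets):
-    print('input: {:>6}  ->  target: {}'.format(x, y))
-```
-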
-## Using a pre-trained AWD LSTM language model
-
-The AWD LSTM language model is a state-of-the-art RNN language model [1]. The main technique it leverages is weight dropout on the recurrent hidden-to-hidden matrices, which prevents overfitting of the recurrent connections.
-
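-Roughly speaking, weight dropout (also known as DropConnect) zeroes out individual entries of the hidden-to-hidden weight matrix on each forward pass, instead of dropping activations. The following NumPy sketch is only an illustration of that idea, not the actual GluonNLP implementation:
-
-```{.python .input}
-import numpy as np
-
-rng = np.random.RandomState(0)
-
-def weight_drop(w_hh, rate=0.5):
-    """DropConnect: randomly zero entries of the recurrent weight matrix
-    and rescale the survivors (training-time behavior)."""
-    mask = rng.uniform(size=w_hh.shape) >= rate
-    return w_hh * mask / (1.0 - rate)
-
-w_hh = rng.randn(4, 4)   # a toy hidden-to-hidden matrix
-h_prev = rng.randn(4)    # previous hidden state
-h_new = np.tanh(weight_drop(w_hh).dot(h_prev))
-print(h_new)
-```
-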
-### Load the vocabulary and the pre-trained model
-
-```{.python .input}
-import warnings
-import math
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-
-warnings.filterwarnings('ignore')
-nlp.utils.check_version('0.7.0')
-
-num_gpus = 1
-context = [mx.gpu(i) for i in range(num_gpus)] if num_gpus else [mx.cpu()]
-log_interval = 200
-
-batch_size = 20 * len(context)
-lr = 20
-epochs = 3
-bptt = 35
-grad_clip = 0.25
-
-dataset_name = 'wikitext-2'
-
-# Load the dataset
-train_dataset, val_dataset, test_dataset = [
- nlp.data.WikiText2(
- segment=segment, bos=None, eos='<eos>', skip_empty=False)
- for segment in ['train', 'val', 'test']
-]
-
-vocab = nlp.Vocab(
- nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-
-
-# Batchify for BPTT
-bptt_batchify = nlp.data.batchify.CorpusBPTTBatchify(
- vocab, bptt, batch_size, last_batch='discard')
-train_data, val_data, test_data = [
- bptt_batchify(x) for x in [train_dataset, val_dataset, test_dataset]
-]
-
-awd_model_name = 'awd_lstm_lm_1150'
-awd_model, vocab = nlp.model.get_model(
- awd_model_name,
- vocab=vocab,
- dataset_name=dataset_name,
- pretrained=True,
- ctx=context[0])
-
-print(awd_model)
-print(vocab)
-```
-
-### Evaluate the pre-trained model on the validation and test datasets
-
-```{.python .input}
-# Specify the loss function, in this case, cross-entropy with softmax.
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(i) for i in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-# Note that ctx is short for context
-def evaluate(model, data_source, batch_size, ctx):
- total_L = 0.0
- ntotal = 0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- for i, (data, target) in enumerate(data_source):
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1), target.reshape(-1))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- return total_L / ntotal
-
-
-val_L = evaluate(awd_model, val_data, batch_size, context[0])
-test_L = evaluate(awd_model, test_data, batch_size, context[0])
-
-print('Best validation loss %.2f, val ppl %.2f' % (val_L, math.exp(val_L)))
-print('Best test loss %.2f, test ppl %.2f' % (test_L, math.exp(test_L)))
-```
-
-## Using a cache LSTM LM
-
-The cache LSTM language model [2] adds a cache-like memory to neural network language models. It can be used in conjunction with the aforementioned AWD LSTM language model or other LSTM models.
-It exploits the hidden outputs to define a probability distribution over the words in the cache,
-and it achieves state-of-the-art results at inference time.
-
-
-
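-Concretely, the cache stores the hidden states of the last `window` time steps together with the words that followed them, scores each cached entry by $\exp(\theta\, h_t^\top h_i)$, and interpolates the resulting distribution with the model's softmax using the interpolation weight (`lambdas` in the next cell). The NumPy sketch below is only an illustration of that interpolation, not the actual implementation returned by `get_cache_model`:
-
-```{.python .input}
-import numpy as np
-
-def cache_probs(h_t, cache_hiddens, cache_words, vocab_size, theta):
-    """Pointer-style distribution over the words stored in the cache."""
-    scores = np.exp(theta * cache_hiddens.dot(h_t))  # similarity to each cached hidden state
-    p = np.zeros(vocab_size)
-    for word_id, s in zip(cache_words, scores):
-        p[word_id] += s
-    return p / p.sum()
-
-rng = np.random.RandomState(0)
-vocab_size, hidden_size, cache_window = 10, 8, 4
-h_t = rng.randn(hidden_size)
-cache_hiddens = rng.randn(cache_window, hidden_size)         # hidden states of recent steps
-cache_words = rng.randint(0, vocab_size, size=cache_window)  # the word that followed each of them
-p_model = np.full(vocab_size, 1.0 / vocab_size)              # stand-in for the LM softmax output
-
-p_cache = cache_probs(h_t, cache_hiddens, cache_words, vocab_size, theta=0.662)
-lambda_cache = 0.1279   # corresponds to `lambdas` in the next cell
-p_final = (1 - lambda_cache) * p_model + lambda_cache * p_cache
-print(p_final.sum())    # still a valid probability distribution
-```
-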
-### Load the pre-trained model and define the hyperparameters
-
-```{.python .input}
-window = 2
-theta = 0.662
-lambdas = 0.1279
-bptt = 2000
-cache_model = nlp.model.train.get_cache_model(name=awd_model_name,
- dataset_name=dataset_name,
- window=window,
- theta=theta,
- lambdas=lambdas,
- ctx=context[0])
-
-print(cache_model)
-```
-
-### Define specific get_batch and evaluation helper functions for the cache model
-
-Note that these helper functions are very similar to the ones we defined above, but differ slightly to work with the cache model.
-
-```{.python .input}
-val_test_batch_size = 1
-val_test_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_test_batch_size)
-val_data = val_test_batchify(val_dataset)
-test_data = val_test_batchify(test_dataset)
-```
-
-```{.python .input}
-def get_batch(data_source, i, seq_len=None):
- seq_len = min(seq_len if seq_len else bptt, len(data_source) - 1 - i)
- data = data_source[i:i + seq_len]
- target = data_source[i + 1:i + 1 + seq_len]
- return data, target
-```
-
-```{.python .input}
-def evaluate_cache(model, data_source, batch_size, ctx):
- total_L = 0.0
- hidden = model.begin_state(
- batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- next_word_history = None
- cache_history = None
- for i in range(0, len(data_source) - 1, bptt):
- if i > 0:
- print('Batch %d, ppl %f' % (i, math.exp(total_L / i)))
- if i == bptt:
- return total_L / i
- data, target = get_batch(data_source, i)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- L = 0
- outs, next_word_history, cache_history, hidden = model(
- data, target, next_word_history, cache_history, hidden)
- for out in outs:
- L += (-mx.nd.log(out)).asscalar()
- total_L += L / data.shape[1]
- hidden = detach(hidden)
- return total_L / len(data_source)
-```
-
-### Evaluate the pre-trained model on the validation and test datasets
-
-```{.python .input}
-val_L = evaluate_cache(cache_model, val_data, val_test_batch_size, context[0])
-test_L = evaluate_cache(cache_model, test_data, val_test_batch_size, context[0])
-
-print('Best validation loss %.2f, val ppl %.2f'%(val_L, math.exp(val_L)))
-print('Best test loss %.2f, test ppl %.2f'%(test_L, math.exp(test_L)))
-```
-
-
-## References
-
-[1] Merity, S., et al. “Regularizing and optimizing LSTM language models”. ICLR 2018
-
-[2] Grave, E., et al. “Improving neural language models with a continuous cache”. ICLR 2017
diff --git a/docs/examples/machine_translation/dataprocessor.py b/docs/examples/machine_translation/dataprocessor.py
deleted file mode 100644
index 8e5e63f4d4..0000000000
--- a/docs/examples/machine_translation/dataprocessor.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data preprocessing for transformer."""
-
-import os
-import io
-import time
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-import nmt
-import hyperparameters as hparams
-
-def cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into a npz
-
- Parameters
- ----------
- dataset : SimpleDataset
- prefix : str
- """
- if not os.path.exists(nmt._constants.CACHE_PATH):
- os.makedirs(nmt._constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def load_cached_dataset(prefix):
- cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Loading dataset...')
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in
- ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-
-class TrainValDataTransform(object):
- """Transform the machine translation dataset.
-
- Clip the source and target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None):
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- if self._src_max_len:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- if self._tgt_max_len:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-
-
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing Time spent: {}'.format(end - start))
- return dataset_processed
-
-
-def load_translation_data(dataset, src_lang='en', tgt_lang='de'):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- src_lang : str, default 'en'
- tgt_lang : str, default 'de'
-
- Returns
- -------
-
- """
- if dataset == 'WMT2014BPE':
- common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- hparams.src_max_len, hparams.tgt_max_len)
- data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=False)
- elif dataset == 'TOY':
- common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- hparams.src_max_len, hparams.tgt_max_len)
- data_train = nmt.dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nmt.dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nmt.dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang)
- else:
- raise NotImplementedError
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = load_cached_dataset(common_prefix + '_train')
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- hparams.src_max_len, hparams.tgt_max_len)
- cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- cache_dataset(data_val_processed, common_prefix + '_val')
- data_test_processed = load_cached_dataset(common_prefix + '_' + str(False) + '_test')
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- cache_dataset(data_test_processed, common_prefix + '_' + str(False) + '_test')
- fetch_tgt_sentence = lambda src, tgt: tgt
- if dataset == 'WMT2014BPE':
- val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=False)
- elif dataset == 'TOY':
- val_text = data_val
- test_text = data_test
- else:
- raise NotImplementedError
- val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence))
- return data_train_processed, data_val_processed, data_test_processed, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-
-
-def get_data_lengths(dataset):
- return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt))))
diff --git a/docs/examples/machine_translation/gnmt.md b/docs/examples/machine_translation/gnmt.md
deleted file mode 100644
index edfc566a46..0000000000
--- a/docs/examples/machine_translation/gnmt.md
+++ /dev/null
@@ -1,531 +0,0 @@
-# Training GNMT on IWSLT 2015 Dataset
-
-In this notebook, we are going to train Google NMT on IWSLT 2015 English-Vietnamese
-Dataset. The building process includes four key steps:
-
-1. Load and preprocess the dataset
-
-2. Create a sampler and `DataLoader`
-
-3. Build the actual model
-
-4. Write the training algorithm
-
-This tutorial will guide you through each of the steps and explain briefly how each works. Please remember to click the download button at the top of the page to download the necessary files to follow this tutorial.
-
-## Setup
-
-Firstly, we need to set up the environment and import the necessary modules. For this tutorial, a GPU is strongly recommended.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import argparse
-import time
-import random
-import os
-import io
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-import nmt
-nlp.utils.check_version('0.7.0')
-```
-
-Next, we need to specify the hyperparameters for the dataset, the model, and for training and testing time.
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-ctx = mx.gpu(0)
-
-# parameters for dataset
-dataset = 'IWSLT2015'
-src_lang, tgt_lang = 'en', 'vi'
-src_max_len, tgt_max_len = 50, 50
-
-# parameters for model
-num_hidden = 512
-num_layers = 2
-num_bi_layers = 1
-dropout = 0.2
-
-# parameters for training
-batch_size, test_batch_size = 128, 32
-num_buckets = 5
-epochs = 1
-clip = 5
-lr = 0.001
-lr_update_factor = 0.5
-log_interval = 10
-save_dir = 'gnmt_en_vi_u512'
-
-#parameters for testing
-beam_size = 10
-lp_alpha = 1.0
-lp_k = 5
-
-nmt.utils.logging_config(save_dir)
-```
-
-## Loading and processing the dataset
-
-The following shows how to process the dataset and cache the processed dataset
-for future use. The processing steps include the following:
-
-1. Clipping the source and target sequences
-2. Splitting the string input to a list of tokens
-3. Mapping the string token onto its integer index in the vocabulary
-4. Appending the end-of-sentence (EOS) token to the source sentence and adding BOS and EOS tokens to the target sentence
-
-
-Firstly, we load and cache the dataset with the two helper functions `cache_dataset` and `load_cached_dataset`. The functions are straightforward and well commented so no further explanation will be given.
-
-```{.python .input}
-def cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into an npz file
-
- Parameters
- ----------
- dataset : gluon.data.SimpleDataset
- prefix : str
- """
- if not os.path.exists(nmt._constants.CACHE_PATH):
- os.makedirs(nmt._constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def load_cached_dataset(prefix):
- cached_file_path = os.path.join(nmt._constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Load cached data from {}'.format(cached_file_path))
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = [npz_data[n] for n in
- ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-```
-
-Next, we write the class `TrainValDataTransform` to have easy access to transforming and clipping the source and target sentences. This class also adds the EOS and BOS tokens for cleaner data. Please refer to the comments in the code for more details.
-
-```{.python .input}
-class TrainValDataTransform(object):
- """Transform the machine translation dataset.
-
- Clip the source and target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len, tgt_max_len):
- # On initialization of the class, we set the class variables
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- # On actual calling of the class, we perform the clipping then the appending of the EOS and BOS tokens.
- if self._src_max_len > 0:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- if self._tgt_max_len > 0:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-```
-
-We leverage the class written above to create a helper function that processes the dataset in very few lines of code.
-
-```{.python .input}
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing time spent: {}'.format(end - start))
- return dataset_processed
-```
-
-Here we define a function `load_translation_data` that combines all the above steps to load the data, check if it's been processed, and if not, process the data. The method returns all of the required data for training, validating, and testing our model. Please refer to the comments in the code for more information on what each piece does.
-
-```{.python .input}
-def load_translation_data(dataset, src_lang='en', tgt_lang='vi'):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- src_lang : str, default 'en'
- tgt_lang : str, default 'vi'
-
- Returns
- -------
- data_train_processed : Dataset
- The preprocessed training sentence pairs
- data_val_processed : Dataset
- The preprocessed validation sentence pairs
- data_test_processed : Dataset
- The preprocessed test sentence pairs
- val_tgt_sentences : list
- The target sentences in the validation set
- test_tgt_sentences : list
- The target sentences in the test set
- src_vocab : Vocab
- Vocabulary of the source language
- tgt_vocab : Vocab
- Vocabulary of the target language
- """
- common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- src_max_len, tgt_max_len)
-
- # Load the three datasets from files
- data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = load_cached_dataset(common_prefix + '_train')
-
- # Check if each dataset has been processed or not, and if not, process and cache them.
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- src_max_len, tgt_max_len)
- cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- cache_dataset(data_val_processed, common_prefix + '_val')
- data_test_processed = load_cached_dataset(common_prefix + '_test')
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- cache_dataset(data_test_processed, common_prefix + '_test')
-
- # Pull out the target sentences for both test and validation
- fetch_tgt_sentence = lambda src, tgt: tgt.split()
- val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
-
- # Return all of the necessary pieces we can extract from the data for training our model
- return data_train_processed, data_val_processed, data_test_processed, \
- val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-```
-
-We define one last helper function, `get_data_lengths`, to get the lengths of the datasets, again for cleaner code later.
-```{.python .input}
-def get_data_lengths(dataset):
- return list(dataset.transform(lambda srg, tgt: (len(srg), len(tgt))))
-
-```
-
-And for the last step of processing, we leverage all of our helper functions to keep the main code concise, at roughly 15-20 lines. This performs all of the aforementioned processing and stores the necessary information in memory for training our model.
-
-```{.python .input}
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
- = load_translation_data(dataset=dataset, src_lang=src_lang, tgt_lang=tgt_lang)
-data_train_lengths = get_data_lengths(data_train)
-data_val_lengths = get_data_lengths(data_val)
-data_test_lengths = get_data_lengths(data_test)
-
-with io.open(os.path.join(save_dir, 'val_gt.txt'), 'w', encoding='utf-8') as of:
- for ele in val_tgt_sentences:
- of.write(' '.join(ele) + '\n')
-
-with io.open(os.path.join(save_dir, 'test_gt.txt'), 'w', encoding='utf-8') as of:
- for ele in test_tgt_sentences:
- of.write(' '.join(ele) + '\n')
-
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-```
-
-## Sampler and `DataLoader` construction
-
-Now, we have obtained and stored all of the relevant data information. The next step
-is to construct the sampler and `DataLoader`. The first step is to use the `batchify`
-function, which pads and stacks sequences to form mini-batches.
-
-```{.python .input}
-train_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'))
-test_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack())
-```
-
-We can then construct bucketing samplers, which generate batches by grouping
-sequences with similar lengths. Here, the bucketing scheme is empirically determined.
-
-```{.python .input}
-bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
-train_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_train_lengths,
- batch_size=batch_size,
- num_buckets=num_buckets,
- shuffle=True,
- bucket_scheme=bucket_scheme)
-logging.info('Train Batch Sampler:\n{}'.format(train_batch_sampler.stats()))
-val_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_val_lengths,
- batch_size=test_batch_size,
- num_buckets=num_buckets,
- shuffle=False)
-logging.info('Valid Batch Sampler:\n{}'.format(val_batch_sampler.stats()))
-test_batch_sampler = nlp.data.FixedBucketSampler(lengths=data_test_lengths,
- batch_size=test_batch_size,
- num_buckets=num_buckets,
- shuffle=False)
-logging.info('Test Batch Sampler:\n{}'.format(test_batch_sampler.stats()))
-```
-
-Given the samplers, we can create a `DataLoader`, which is iterable. This is simply a data construct (an iterator) that feeds the model one batch at a time. For more information refer to [this](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/datasets.html) page.
-
-```{.python .input}
-train_data_loader = gluon.data.DataLoader(data_train,
- batch_sampler=train_batch_sampler,
- batchify_fn=train_batchify_fn,
- num_workers=4)
-val_data_loader = gluon.data.DataLoader(data_val,
- batch_sampler=val_batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=4)
-test_data_loader = gluon.data.DataLoader(data_test,
- batch_sampler=test_batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=4)
-```
-
-## Building the GNMT model
-
-After obtaining the DataLoader, we can finally build the model. The GNMT encoder and decoder
-can be easily constructed by calling the `get_gnmt_encoder_decoder` function. Then, we
-feed the encoder and decoder to the `NMTModel` to construct the GNMT model.
-
-`model.hybridize` allows computation to be done using the symbolic backend. To understand what it means to be "hybridized," please refer to [this](https://mxnet.incubator.apache.org/versions/master/tutorials/gluon/hybrid.html) page on MXNet hybridization and its advantages.
-
-```{.python .input}
-encoder, decoder, one_step_ahead_decoder = nmt.gnmt.get_gnmt_encoder_decoder(
- hidden_size=num_hidden, dropout=dropout, num_layers=num_layers,
- num_bi_layers=num_bi_layers)
-model = nlp.model.translation.NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder,
- decoder=decoder, one_step_ahead_decoder=one_step_ahead_decoder,
- embed_size=num_hidden, prefix='gnmt_')
-model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-# Due to the paddings, we need to mask out the losses corresponding to padding tokens.
-loss_function = nlp.loss.MaskedSoftmaxCELoss()
-loss_function.hybridize(static_alloc=static_alloc)
-```
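-
-To see why the masking matters, here is a tiny NumPy illustration (added for clarity; it mimics the masking idea, not the exact reduction used by `MaskedSoftmaxCELoss`) of ignoring the loss terms that fall on padding positions:
-
-```{.python .input}
-import numpy as np
-
-# Toy per-token losses for a batch of two sequences padded to length 5
-token_losses = np.array([[1.2, 0.8, 0.5, 0.3, 0.9],
-                         [0.7, 0.6, 1.1, 0.4, 0.2]])
-valid_length = np.array([3, 5])  # true (unpadded) lengths
-
-# Mask that is 1 for real tokens and 0 for padding
-positions = np.arange(token_losses.shape[1])
-mask = (positions[None, :] < valid_length[:, None]).astype(token_losses.dtype)
-
-# Average the loss over the valid tokens only
-masked_loss = (token_losses * mask).sum(axis=1) / valid_length
-print(masked_loss)
-```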
-
-Here, we build the `BeamSearchTranslator` and define a predetermined `BeamSearchScorer` as the heuristic mechanism for the search. For more information on beam search and its applications to NLP, see [here](https://en.wikipedia.org/wiki/Beam_search).
-
-```{.python .input}
-translator = nmt.translation.BeamSearchTranslator(model=model, beam_size=beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=lp_alpha,
- K=lp_k),
- max_length=tgt_max_len + 100)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(beam_size, lp_alpha, lp_k))
-```
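-
-As a rough sketch of the scoring idea (assuming the GNMT-style length penalty that the `alpha` and `K` arguments above suggest; this is an illustration, not the `BeamSearchScorer` source), candidate scores are length-normalized log-probabilities:
-
-```{.python .input}
-alpha, K = 1.0, 5   # the lp_alpha and lp_k values set in the hyperparameter cell above
-
-def length_penalty(length):
-    """GNMT-style length penalty: ((K + length) / (K + 1)) ** alpha."""
-    return ((K + length) / (K + 1.0)) ** alpha
-
-def normalized_score(log_prob, length):
-    return log_prob / length_penalty(length)
-
-# Normalization shrinks the magnitude of the (negative) log-probability of longer
-# candidates, so beam search does not systematically favor very short translations.
-log_p, length = -9.0, 10
-print('raw: %.2f  normalized: %.2f' % (log_p, normalized_score(log_p, length)))
-```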
-
-We define the evaluation function as shown in the code block below. The `evaluate` function uses the beam
-search translator to generate outputs for the validation and testing datasets. Please refer to the comments in the code for more information on what each piece does. In addition, we add the `write_sentences` helper method to easily output the sentences.
-
-```{.python .input}
-def evaluate(data_loader):
- """Evaluate given the data loader
-
- Parameters
- ----------
- data_loader : gluon.data.DataLoader
-
- Returns
- -------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
- """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
-
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
-
- # Calculate Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
-
- # Translate the sequences and score them
- samples, _, sample_valid_length =\
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
-
- # Iterate through the tokens and stitch the tokens together for the sentence
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
-
- # Calculate the average loss and initialize a None-filled translation list
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
-
- # Combine all the words/tokens into a sentence for the final translation
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
-
- # Return the loss and the translation
- return avg_loss, real_translation_out
-
-
-def write_sentences(sentences, file_path):
- with io.open(file_path, 'w', encoding='utf-8') as of:
- for sent in sentences:
- of.write(' '.join(sent) + '\n')
-```
-
-## Training
-
-Before entering the training stage, we need to create a trainer for updating the
-parameters based on the loss. In the following example, we create a trainer that uses the ADAM
-optimizer.
-
-```{.python .input}
-trainer = gluon.Trainer(model.collect_params(), 'adam', {'learning_rate': lr})
-```
-
-We can then write the training loop. During training, we evaluate on the validation and testing datasets every epoch, and record the
-parameters that give the highest [Bilingual Evaluation Understudy (BLEU)](https://www.aclweb.org/anthology/P02-1040.pdf) score on the validation dataset. Before
-performing forward and backward computation, we first use the `as_in_context` function to copy
-the mini-batch to the GPU. The statement `with mx.autograd.record()` tells Gluon's
-backend to compute the gradients for the part inside the block.
-
-```{.python .input}
-best_valid_bleu = 0.0
-
-# Run through each epoch
-for epoch_id in range(epochs):
- log_avg_loss = 0
- log_avg_gnorm = 0
- log_wc = 0
- log_start_time = time.time()
-
- # Iterate through each batch
- for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
- in enumerate(train_data_loader):
-
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
-
- # Compute gradients and losses
- with mx.autograd.record():
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
- loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
- loss.backward()
-
- grads = [p.grad(ctx) for p in model.collect_params().values()]
- gnorm = gluon.utils.clip_global_norm(grads, clip)
- trainer.step(1)
- src_wc = src_valid_length.sum().asscalar()
- tgt_wc = (tgt_valid_length - 1).sum().asscalar()
- step_loss = loss.asscalar()
- log_avg_loss += step_loss
- log_avg_gnorm += gnorm
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % log_interval == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / log_interval,
- np.exp(log_avg_loss / log_interval),
- log_avg_gnorm / log_interval,
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_avg_gnorm = 0
- log_wc = 0
-
- # Evaluate the losses on validation and test datasets and find the corresponding BLEU score and log it
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
-
- # Output the sentences we predicted on the validation and test datasets
- write_sentences(valid_translation_out,
- os.path.join(save_dir, 'epoch{:d}_valid_out.txt').format(epoch_id))
- write_sentences(test_translation_out,
- os.path.join(save_dir, 'epoch{:d}_test_out.txt').format(epoch_id))
-
- # Save the model if the BLEU score is better than the previous best
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
-
- # Update the learning rate based on the number of epochs that have passed
- if epoch_id + 1 >= (epochs * 2) // 3:
- new_lr = trainer.learning_rate * lr_update_factor
- logging.info('Learning rate change to {}'.format(new_lr))
- trainer.set_learning_rate(new_lr)
-```
-
-## Conclusion
-In this notebook, we have shown how to train a GNMT model on the IWSLT 2015 English-Vietnamese dataset using the Gluon NLP toolkit.
-The complete training script can be found [here](https://github.com/dmlc/gluon-nlp/blob/master/scripts/machine_translation/train_gnmt.py).
-The code sequence to reproduce the results can be seen on the [machine translation page](http://gluon-nlp.mxnet.io/model_zoo/machine_translation/index.html).
diff --git a/docs/examples/machine_translation/hyperparameters.py b/docs/examples/machine_translation/hyperparameters.py
deleted file mode 100644
index f0e31a5949..0000000000
--- a/docs/examples/machine_translation/hyperparameters.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hyperparameters for transformer."""
-
-import nmt
-
-# parameters for dataset
-src_lang = 'en'
-tgt_lang = 'de'
-src_max_len = -1
-tgt_max_len = -1
-
-# parameters for model
-num_units = 512
-hidden_size = 2048
-dropout = 0.1
-epsilon = 0.1
-num_layers = 6
-num_heads = 8
-scaled = True
-
-# parameters for training
-optimizer = 'adam'
-epochs = 3
-batch_size = 2700
-test_batch_size = 256
-num_accumulated = 1
-lr = 2
-warmup_steps = 1
-save_dir = 'transformer_en_de_u512'
-average_start = 1
-num_buckets = 20
-log_interval = 10
-bleu = '13a'
-
-#parameters for testing
-beam_size = 4
-lp_alpha = 0.6
-lp_k = 5
\ No newline at end of file
diff --git a/docs/examples/machine_translation/index.rst b/docs/examples/machine_translation/index.rst
deleted file mode 100644
index 061e1197e2..0000000000
--- a/docs/examples/machine_translation/index.rst
+++ /dev/null
@@ -1,28 +0,0 @@
-Machine Translation
-===================
-
-.. container:: cards
-
- .. card::
- :title: Training GNMT on IWSLT 2015 Dataset
- :link: gnmt.html
-
- Learn how to train Google Neural Machine Translation, a sequence-to-sequence model with attention.
-
- .. card::
- :title: Using Pre-trained Transformer
- :link: transformer.html
-
- Learn how to use a pre-trained transformer translation model for English-German translation.
-
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- gnmt.ipynb
- transformer.ipynb
-
-
-
diff --git a/docs/examples/machine_translation/nmt b/docs/examples/machine_translation/nmt
deleted file mode 120000
index aa09220de6..0000000000
--- a/docs/examples/machine_translation/nmt
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/machine_translation
\ No newline at end of file
diff --git a/docs/examples/machine_translation/transformer.md b/docs/examples/machine_translation/transformer.md
deleted file mode 100644
index d8cdb50654..0000000000
--- a/docs/examples/machine_translation/transformer.md
+++ /dev/null
@@ -1,249 +0,0 @@
-# Using Pre-trained Transformer
-
-In this notebook, we will show how to use the Transformer introduced in [1] and evaluate the pre-trained model with GluonNLP. The Transformer model has been shown to be more accurate and easier to parallelize than previous seq2seq-based models such as Google Neural Machine Translation. We will load the state-of-the-art pre-trained Transformer model, evaluate it on newstest2014, and translate a few sentences ourselves with the `BeamSearchTranslator`.
-
-## Setup
-
-We start with some usual preparation such as importing libraries and setting the environment.
-
-
-### Load MXNet and GluonNLP
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import random
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-### Setup the environment
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-ctx = mx.gpu(0)
-```
-
-## Using the pre-trained transformer model
-
-Next, we load the Transformer model from the GluonNLP model zoo and evaluate it on the full `newstest2014` segment of the WMT 2014 English-German test dataset.
-
-### Load the transformer
-
-We load the pre-trained Transformer using the model API in GluonNLP, which returns the source and target vocabulary along with the model.
-
-```{.python .input}
-import nmt
-
-wmt_model_name = 'transformer_en_de_512'
-
-wmt_transformer_model, wmt_src_vocab, wmt_tgt_vocab = \
- nlp.model.get_model(wmt_model_name,
- dataset_name='WMT2014',
- pretrained=True,
- ctx=ctx)
-
-# we are using a mixed EN-DE vocabulary, so the source and target vocabularies are the same
-print(len(wmt_src_vocab), len(wmt_tgt_vocab))
-```
-
-The Transformer model architecture is shown below:
-
-![transformer](transformer.png)
-
-### Load and preprocess the dataset
-
-We then load the `newstest2014` segment of the WMT 2014 English-German test dataset for evaluation purposes.
-
-The following shows how to process the dataset and cache the processed dataset
-for future use. The processing steps include:
-
-1) clip the source and target sequences
-2) split the string input to a list of tokens
-3) map the string token into its index in the vocabulary
-4) append EOS token to source sentence and add BOS and EOS tokens to target sentence.
-
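-Before turning to the actual dataset classes, here is a minimal, self-contained sketch of the four steps above on a toy sentence pair. The tiny vocabularies and sentences are invented purely for illustration; the real pipeline below relies on the BPE vocabularies returned by the model API and on the `dataprocessor.TrainValDataTransform` helper.
-
-```{.python .input}
-# Toy illustration only -- these vocabularies are made up for this sketch.
-toy_src_vocab = {'we': 0, 'love': 1, 'language': 2, '<eos>': 3}
-toy_tgt_vocab = {'<bos>': 0, '<eos>': 1, 'wir': 2, 'lieben': 3, 'sprache': 4}
-
-src_raw, tgt_raw = 'we love language', 'wir lieben sprache'
-src_tokens = src_raw.split()[:100]   # 1)-2) tokenize and clip to a maximum length
-tgt_tokens = tgt_raw.split()[:100]
-src_ids = [toy_src_vocab[tok] for tok in src_tokens]   # 3) map tokens to vocabulary indices
-tgt_ids = [toy_tgt_vocab[tok] for tok in tgt_tokens]
-src_ids = src_ids + [toy_src_vocab['<eos>']]           # 4) append EOS to the source
-tgt_ids = [toy_tgt_vocab['<bos>']] + tgt_ids + [toy_tgt_vocab['<eos>']]  # and BOS/EOS to the target
-print(src_ids, tgt_ids)
-```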
-Let's first look at the WMT 2014 corpus. GluonNLP provides [WMT2014BPE](../../api/data.rst#gluonnlp.data.WMT2014BPE)
-and [WMT2014](../../api/data.rst#gluonnlp.data.WMT2014) classes. The former contains the BPE-tokenized dataset, while
-the latter contains the raw text. Here, we use the former for scoring and the latter for
-demonstrating actual translation.
-
-```{.python .input}
-import hyperparameters as hparams
-
-wmt_data_test = nlp.data.WMT2014BPE('newstest2014',
- src_lang=hparams.src_lang,
- tgt_lang=hparams.tgt_lang)
-print('Source language %s, Target language %s' % (hparams.src_lang, hparams.tgt_lang))
-print('Sample BPE tokens: "{}"'.format(wmt_data_test[0]))
-
-wmt_test_text = nlp.data.WMT2014('newstest2014',
- src_lang=hparams.src_lang,
- tgt_lang=hparams.tgt_lang)
-print('Sample raw text: "{}"'.format(wmt_test_text[0]))
-
-wmt_test_tgt_sentences = wmt_test_text.transform(lambda src, tgt: tgt)
-print('Sample target sentence: "{}"'.format(wmt_test_tgt_sentences[0]))
-```
-
-```{.python .input}
-import dataprocessor
-
-print(dataprocessor.TrainValDataTransform.__doc__)
-
-# wmt_transform_fn includes the four preprocessing steps mentioned above.
-wmt_transform_fn = dataprocessor.TrainValDataTransform(wmt_src_vocab, wmt_tgt_vocab)
-wmt_dataset_processed = wmt_data_test.transform(wmt_transform_fn, lazy=False)
-print(*wmt_dataset_processed[0], sep='\n')
-
-def get_length_index_fn():
- global idx
- idx = 0
- def transform(src, tgt):
- global idx
- result = (src, tgt, len(src), len(tgt), idx)
- idx += 1
- return result
- return transform
-
-wmt_data_test_with_len = wmt_dataset_processed.transform(get_length_index_fn(), lazy=False)
-```
-
-### Create the sampler and `DataLoader`
-
-Now, we have obtained the transformed datasets. The next step is to construct the sampler and `DataLoader`. First, we need to construct the batchify function, which pads and stacks sequences to form mini-batches.
-
-```{.python .input}
-wmt_test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack(dtype='float32'),
- nlp.data.batchify.Stack())
-```
-
-In GluonNLP, all dataset items are tuples. Each item in the preprocessed `wmt_data_test_with_len` is a
-`(src, tgt, len(src), len(tgt), idx)` tuple. To express how we'd like to batchify each element, we use the built-in batchify functions.
-
-* [Tuple](../../api/data.batchify.rst#gluonnlp.data.batchify.Tuple) is the GluonNLP way of applying different batchify functions to each element of a dataset item. In this case, we are applying `Pad` to `src` and `tgt`, `Stack` to `len(src)` and `len(tgt)` with conversion to float32, and simple `Stack` to `idx` without type conversion.
-* [Pad](../../api/data.batchify.rst#gluonnlp.data.batchify.Pad) takes the corresponding elements from all dataset items in a batch, and pads them to the maximum length among them to form a padded matrix/tensor (see the toy example below).
-* [Stack](../../api/data.batchify.rst#gluonnlp.data.batchify.Stack) simply stacks all elements in a batch, and requires all elements to be of the same length.
-
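-As a quick sanity check (a toy example, separate from the evaluation pipeline; it assumes `Pad` and `Stack` can be called directly on plain Python lists), we can apply them to a tiny batch:
-
-```{.python .input}
-# Toy batch of three "token id" sequences with different lengths
-toy_seqs = [[1, 2, 3, 4], [5, 6], [7, 8, 9]]
-toy_lens = [4, 2, 3]
-
-# Pad fills the shorter sequences with pad_val=0 up to the longest one in the batch
-print(nlp.data.batchify.Pad(pad_val=0)(toy_seqs))
-# Stack simply stacks the (same-shape) elements, here the scalar lengths
-print(nlp.data.batchify.Stack(dtype='float32')(toy_lens))
-```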
-
-We can then construct bucketing samplers, which generate batches by grouping sequences with similar lengths. Here, we use [FixedBucketSampler](../../api/data.rst#gluonnlp.data.FixedBucketSampler) with [ExpWidthBucket](../../api/data.rst#gluonnlp.data.ExpWidthBucket). FixedBucketSampler assigns each data sample to a fixed bucket based on its length, and ExpWidthBucket makes the bucket widths (the intervals between maximum bucket lengths) grow approximately exponentially.
-
-```{.python .input}
-wmt_bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
-wmt_test_batch_sampler = nlp.data.FixedBucketSampler(
- lengths=wmt_data_test_with_len.transform(lambda src, tgt, src_len, tgt_len, idx: tgt_len), # target length
- use_average_length=True, # control the element lengths (i.e. number of tokens) to be about the same
- bucket_scheme=wmt_bucket_scheme,
- batch_size=256)
-print(wmt_test_batch_sampler.stats())
-```
-
-Given the sampler, we can use a [DataLoader](https://mxnet.apache.org/versions/master/api/python/gluon/data.html#mxnet.gluon.data.DataLoader) to iterate over the dataset in mini-batches.
-
-```{.python .input}
-wmt_test_data_loader = gluon.data.DataLoader(
- wmt_data_test_with_len,
- batch_sampler=wmt_test_batch_sampler,
- batchify_fn=wmt_test_batchify_fn,
- num_workers=8)
-len(wmt_test_data_loader)
-```
-
-### Evaluating the transformer
-
-Next, we evaluate the performance of the model on the WMT test dataset. We first define the `BeamSearchTranslator` to generate the actual translations.
-
-```{.python .input}
-wmt_translator = nmt.translation.BeamSearchTranslator(
- model=wmt_transformer_model,
- beam_size=hparams.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=hparams.lp_alpha, K=hparams.lp_k),
- max_length=200)
-```
-
-Then we calculate the `loss` as well as the `bleu` score on the `newstest2014` WMT 2014 English-German test dataset. This may take a while.
-
-```{.python .input}
-import time
-import utils
-
-eval_start_time = time.time()
-
-wmt_test_loss_function = nlp.loss.MaskedSoftmaxCELoss()
-wmt_test_loss_function.hybridize()
-
-wmt_detokenizer = nlp.data.SacreMosesDetokenizer()
-
-wmt_test_loss, wmt_test_translation_out = utils.evaluate(wmt_transformer_model,
- wmt_test_data_loader,
- wmt_test_loss_function,
- wmt_translator,
- wmt_tgt_vocab,
- wmt_detokenizer,
- ctx)
-
-wmt_test_bleu_score, _, _, _, _ = nmt.bleu.compute_bleu([wmt_test_tgt_sentences],
- wmt_test_translation_out,
- tokenized=False,
- tokenizer=hparams.bleu,
- split_compound_word=False,
- bpe=False)
-
-print('WMT14 EN-DE SOTA model test loss: %.2f; test bleu score: %.2f; time cost %.2fs'
- %(wmt_test_loss, wmt_test_bleu_score * 100, (time.time() - eval_start_time)))
-```
-
-```{.python .input}
-print('Sample translations:')
-num_pairs = 3
-
-for i in range(num_pairs):
- print('EN:')
- print(wmt_test_text[i][0])
- print('DE-Candidate:')
- print(wmt_test_translation_out[i])
- print('DE-Reference:')
- print(wmt_test_tgt_sentences[i])
- print('========')
-```
-
-### Translation Inference
-
-We now show an actual English-to-German translation example using the SOTA Transformer model, given a source sentence.
-
-```{.python .input}
-import utils
-
-print('Translate the following English sentence into German:')
-
-sample_src_seq = 'We love language .'
-
-print('[\'' + sample_src_seq + '\']')
-
-sample_tgt_seq = utils.translate(wmt_translator,
- sample_src_seq,
- wmt_src_vocab,
- wmt_tgt_vocab,
- wmt_detokenizer,
- ctx)
-
-print('The German translation is:')
-print(sample_tgt_seq)
-```
-
-If you'd like to train your own transformer models, you may find the training scripts in our
-[model zoo](../../model_zoo/machine_translation/index.rst).
-
-## References
-
-[1] Vaswani, Ashish, et al. "Attention is all you need." Advances in Neural Information Processing Systems. 2017.
diff --git a/docs/examples/machine_translation/transformer.png b/docs/examples/machine_translation/transformer.png
deleted file mode 100644
index 2561c01a11..0000000000
Binary files a/docs/examples/machine_translation/transformer.png and /dev/null differ
diff --git a/docs/examples/machine_translation/utils.py b/docs/examples/machine_translation/utils.py
deleted file mode 100644
index faa16a39d0..0000000000
--- a/docs/examples/machine_translation/utils.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for transformer."""
-
-import numpy as np
-import math
-import mxnet as mx
-import time
-import logging
-import io
-import nmt
-import hyperparameters as hparams
-
-def evaluate(model, data_loader, test_loss_function, translator, tgt_vocab, detokenizer, context):
-    """Evaluate the model on the given data loader.
-
-    Parameters
-    ----------
-    model : Block
-        Model to evaluate.
-    data_loader : DataLoader
-        Yields (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) batches.
-    test_loss_function : Loss
-        Loss used to compute the average test loss.
-    translator : BeamSearchTranslator
-        Translator used to generate candidate translations.
-    tgt_vocab : Vocab
-        Target-side vocabulary.
-    detokenizer : callable
-        Detokenizer applied to the generated BPE tokens.
-    context : Context
-        Device on which to run the evaluation.
-
-    Returns
-    -------
-    avg_loss : float
-        Average loss
-    real_translation_out : list of str
-        The detokenized translation output
-    """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(context)
- tgt_seq = tgt_seq.as_in_context(context)
- src_valid_length = src_valid_length.as_in_context(context)
- tgt_valid_length = tgt_valid_length.as_in_context(context)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
- # Translate
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence),
- return_str=True)
- return avg_loss, real_translation_out
-
-def translate(translator, src_seq, src_vocab, tgt_vocab, detokenizer, ctx):
- src_sentence = src_vocab[src_seq.split()]
- src_sentence.append(src_vocab[src_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- src_nd = mx.nd.array(src_npy)
- src_nd = src_nd.reshape((1, -1)).as_in_context(ctx)
- src_valid_length = mx.nd.array([src_nd.shape[1]]).as_in_context(ctx)
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_nd, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
-
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- translation_out = []
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- real_translation_out = [None for _ in range(len(translation_out))]
- for ind, sentence in enumerate(translation_out):
- real_translation_out[ind] = detokenizer(nmt.bleu._bpe_to_words(sentence),
- return_str=True)
- return real_translation_out
-
-def train_one_epoch(epoch_id, model, train_data_loader, trainer, label_smoothing, loss_function,
-                    grad_interval, average_param_dict, update_average_param_dict, step_num, ctx):
-    """Run one training epoch, updating the model parameters and the running parameter average."""
- log_avg_loss = 0
- log_wc = 0
- loss_denom = 0
- step_loss = 0
- log_start_time = time.time()
- for batch_id, seqs in enumerate(train_data_loader):
- if batch_id % grad_interval == 0:
- step_num += 1
- new_lr = hparams.lr / math.sqrt(hparams.num_units) * min(1. / math.sqrt(step_num), step_num * hparams.warmup_steps ** (-1.5))
- trainer.set_learning_rate(new_lr)
- src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0])
- for shard in seqs], axis=0)
- src_wc = src_wc.asscalar()
- tgt_wc = tgt_wc.asscalar()
- loss_denom += tgt_wc - bs
- seqs = [[seq.as_in_context(context) for seq in shard]
- for context, shard in zip([ctx], seqs)]
- Ls = []
- with mx.autograd.record():
- for src_seq, tgt_seq, src_valid_length, tgt_valid_length in seqs:
- out, _ = model(src_seq, tgt_seq[:, :-1],
- src_valid_length, tgt_valid_length - 1)
- smoothed_label = label_smoothing(tgt_seq[:, 1:])
- ls = loss_function(out, smoothed_label, tgt_valid_length - 1).sum()
- Ls.append((ls * (tgt_seq.shape[1] - 1)) / hparams.batch_size / 100.0)
- for L in Ls:
- L.backward()
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- if update_average_param_dict:
- for k, v in model.collect_params().items():
- average_param_dict[k] = v.data(ctx).copy()
- update_average_param_dict = False
-
- trainer.step(float(loss_denom) / hparams.batch_size / 100.0)
- param_dict = model.collect_params()
- param_dict.zero_grad()
- if step_num > hparams.average_start:
- alpha = 1. / max(1, step_num - hparams.average_start)
- for name, average_param in average_param_dict.items():
- average_param[:] += alpha * (param_dict[name].data(ctx) - average_param)
- step_loss += sum([L.asscalar() for L in Ls])
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- log_avg_loss += step_loss / loss_denom * hparams.batch_size * 100.0
- loss_denom = 0
- step_loss = 0
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % (hparams.log_interval * grad_interval) == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / hparams.log_interval,
- np.exp(log_avg_loss / hparams.log_interval),
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_wc = 0
\ No newline at end of file
diff --git a/docs/examples/notes/data_api.rst b/docs/examples/notes/data_api.rst
deleted file mode 100644
index 8b7cc26fd6..0000000000
--- a/docs/examples/notes/data_api.rst
+++ /dev/null
@@ -1,286 +0,0 @@
-Data Loading API
-----------------
-
-In this tutorial, we show how to load and process the sentiment dataset to form batches that can be processed efficiently,
-using classes from :mod:`gluonnlp.data.sampler` and :mod:`gluonnlp.data.batchify`.
-We use the :class:`~gluonnlp.data.IMDB`
-dataset as an example; it has 50,000 movie reviews labeled as positive or negative, and
-is split into training and testing sets of 25,000 reviews each.
-
-Data Loading
-~~~~~~~~~~~~
-
-Let us see a quick example.
-
-.. code:: python
-
- >>> import mxnet as mx
- >>> from mxnet import gluon, nd
- >>> import gluonnlp as nlp
-
-.. code:: python
-
- >>> train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- >>> for segment in ('train', 'test')]
-
-.. code:: python
-
- >>> print('#training samples={:d}, #testing samples={:d}'.format(len(train_dataset),
- >>> len(test_dataset)))
-
-    #training samples=25000, #testing samples=25000
-
-.. code:: python
-
- >>> print(train_dataset[0])
-
- ['Bromwell High is a cartoon comedy. It ran at the same time as some other programs
- about school life, such as "Teachers". My 35 years in the teaching profession lead
- me to believe that Bromwell High\'s satire is much closer to reality than is "Teachers".
- The scramble to survive financially, the insightful students who can see right through
- their pathetic teachers\' pomp, the pettiness of the whole situation, all remind me of
- the schools I knew and their students. When I saw the episode in which a student repeatedly
- tried to burn down the school, I immediately recalled ......... at .......... High. A
- classic line: INSPECTOR: I\'m here to sack one of your teachers. STUDENT: Welcome to
- Bromwell High. I expect that many adults of my age think that Bromwell High is far fetched.
- What a pity that it isn\'t!', 9]
-
-In the above example, we load ``train_dataset`` and ``test_dataset``, which are both :class:`~mxnet.gluon.data.SimpleDataset` objects.
-
-A :class:`~mxnet.gluon.data.SimpleDataset` is a wrapper for lists and arrays. Each entry in ``train_dataset`` is a [string, score] pair,
-where the score falls in [1, 2, ..., 10]. Thus, in the example above, a score of 9 indicates positive feedback on the movie.
-
-
-Data Processing
-~~~~~~~~~~~~~~~
-
-The next step is to preprocess the data so that it can be used to train the model. The following code
-shows how to tokenize the string with :class:`~gluonnlp.data.SpacyTokenizer` and then clip
-the list of output tokens by length with :class:`~gluonnlp.data.ClipSequence`.
-
-.. code:: python
-
- >>> tokenizer = nlp.data.SpacyTokenizer('en')
- >>> # We use 50 as maximum length for illustration
- >>> # For actual learning, we may use a large value such as 500
- >>> length_clip = nlp.data.ClipSequence(50)
- >>> seq, score = train_dataset[0]
- >>> print(length_clip(tokenizer(seq)))
-
- ['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same',
- 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as',
- '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead',
- 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer',
- 'to', 'reality', 'than', 'is']
-
-Now, we are ready to preprocess the whole dataset. The following code shows how to tokenize the dataset in parallel using multiprocessing.
-
-.. code:: python
-
- >>> import time
- >>> import multiprocessing as mp
- >>> length_clip = nlp.data.ClipSequence(500)
-
-.. code:: python
-
- >>> # Dataset preprocessing
- >>> def preprocess(x):
- >>> data, label = x
- >>> # In the labeled train/test sets, a negative review has a score <= 4
- >>> # out of 10, and a positive review has a score >= 7 out of 10. Thus
- >>> # reviews with more neutral ratings are not included in the train/test
- >>> # sets. We labeled a negative review whose score <= 4 as 0, and a
-    >>>     # positive review whose score >= 7 as 1. As the neutral ratings are not
- >>> # included in the datasets, we can simply use 5 as our threshold.
- >>> label = int(label > 5)
- >>> data = length_clip(tokenizer(data))
- >>> return data, label
- >>>
- >>> def get_length(x):
- >>> return float(len(x[0]))
- >>>
- >>> def preprocess_dataset(dataset):
- >>> start = time.time()
- >>> pool = mp.Pool()
- >>> dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- >>> lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- >>> end = time.time()
- >>> print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- >>> return dataset, lengths
- >>>
- >>> # Preprocess the dataset
- >>> train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
- >>> test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-
- Tokenize using spaCy...
-
- Done! Tokenizing Time=12.85s, #Sentences=25000
-
- Done! Tokenizing Time=12.99s, #Sentences=25000
-
-Then, we are going to construct a :class:`vocabulary <gluonnlp.Vocab>` for the training dataset. The vocabulary
-will be used to convert the tokens to numerical indices, which facilitates the creation of word embedding matrices.
-
-.. code:: python
-
- >>> import itertools
- >>> train_seqs = [sample[0] for sample in train_dataset]
- >>> counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))
- >>> vocab = nlp.Vocab(counter, max_size=10000, padding_token=None,
- >>> bos_token=None, eos_token=None)
- >>> print(vocab)
-
-    Vocab(size=10001, unk="<unk>", reserved="None")
-
-.. code:: python
-
- >>> # Convert string token to its index in the dictionary
- >>> def token_to_idx(x):
- >>> return vocab[x[0]], x[1]
- >>>
- >>> pool = mp.Pool()
- >>> train_dataset = pool.map(token_to_idx, train_dataset)
- >>> test_dataset = pool.map(token_to_idx, test_dataset)
- >>> pool.close()
- >>> print(train_dataset[0][0][:50])
-
- [0, 2012, 8, 4, 1116, 231, 3, 51, 2311, 40, 1, 188, 67, 20, 59, 97, 6190, 49, 422, 133,
- 2, 160, 20, 13, 0, 13, 3, 374, 5063, 174, 9, 1, 5390, 6674, 498, 83, 7, 282, 12, 0, 2012,
- 15, 2042, 8, 88, 2661, 7, 714, 87, 8]
-
-
-Bucketing and Dataloader
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The next step is to construct a :class:`dataloader <mxnet.gluon.data.DataLoader>` for training.
-As the sequences have variable lengths, we need to pad the sequences so that they have the same
-lengths in the minibatch, which allows fast tensor manipulation on the GPU.
-
-.. code:: python
-
- >>> batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0),
- >>> nlp.data.batchify.Stack())
-
-:class:`~gluonnlp.data.batchify.Tuple` wraps multiple batchify functions and applies each input function on each input field,
-respectively. In this case, we are applying :class:`~gluonnlp.data.batchify.Pad` on the sequence and :class:`~gluonnlp.data.batchify.Stack`
-on the labels. Given the batchify function, we can construct the dataloaders for both training samples and testing samples.
-
-.. code:: python
-
-    >>> batch_size = 16
-    >>> train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
- >>> batch_size=batch_size,
- >>> shuffle=True,
- >>> batchify_fn=batchify_fn)
- >>> test_dataloader = gluon.data.DataLoader(dataset=test_dataset,
- >>> batch_size=batch_size,
- >>> shuffle=False,
- >>> batchify_fn=batchify_fn)
-
-As :class:`~mxnet.gluon.data.DataLoader` is iterable, we can iterate over the dataset easily using the following code:
-
-.. code:: python
-
-    >>> for data, label in train_dataloader:
-    >>>     print(data.shape, label.shape)
-    >>>     break
-
-In the above example, minibatches are formed using uniform sampling, which can cause a large amount of padding as shown
-in the figure below.
-
-.. image:: ./images/no_bucket_strategy.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-In light of this, we consider
-constructing a sampler using bucketing, which defines how the samples in a dataset will be iterated in a more economical way.
-
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> num_buckets=10,
- >>> ratio=0,
- >>> shuffle=True)
- >>> print(batch_sampler.stats())
-
- FixedBucketSampler:
- sample_num=25000, batch_num=1567
- key=[68, 116, 164, 212, 260, 308, 356, 404, 452, 500]
- cnt=[981, 1958, 5686, 4614, 2813, 2000, 1411, 1129, 844, 3564]
- batch_size=[16, 16, 16, 16, 16, 16, 16, 16, 16, 16]
-
-In this example, we use a :class:`~gluonnlp.data.sampler.FixedBucketSampler`, which assigns each data sample to a
-fixed bucket based on its length.
-
-The bucket keys are either given or generated from the input sequence lengths. We construct 10 buckets, where `cnt`
-shows the number of samples belonging to each bucket. A graphic illustration of using :class:`~gluonnlp.data.sampler.FixedBucketSampler`
-can be seen as follows:
-
-.. image:: ./images/fixed_bucket_strategy_ratio0.0.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-To further improve the throughput, we can consider scaling up the batch size of smaller buckets. This can be achieved
-by using the parameter ``ratio``. Assume the :math:`i`-th key is :math:`K_i`, the default batch size is :math:`B`, the ratio to
-scale the batch size is :math:`\alpha`, and the batch size corresponding to the :math:`i`-th bucket is :math:`B_i`. We have:
-
-.. math::
-
- B_i = \max(\alpha B \times \frac{\max_j sum(K_j)}{sum(K_i)}, B)
-
-.. image:: ./images/fixed_bucket_strategy_ratio0.7.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-Thus, setting this to a value larger than 0, like 0.5, will scale up the batch size of the
-smaller buckets.
-
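-To make the formula concrete (using the bucket keys printed above, and assuming the scaled value is rounded down): with :math:`B = 16`, :math:`\alpha = 0.5` and a maximum key of 500, the first bucket with key 68 gets :math:`B_0 = \max(0.5 \times 16 \times 500 / 68, 16) \approx 58`, while buckets whose scaled value falls below 16 keep the default batch size of 16. This matches the ``batch_size`` list in the sampler statistics below.
-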
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.FixedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> num_buckets=10,
- >>> ratio=0.5,
- >>> shuffle=True)
- >>> print(batch_sampler.stats())
-
- FixedBucketSampler:
- sample_num=25000, batch_num=1306
- key=[68, 116, 164, 212, 260, 308, 356, 404, 452, 500]
- cnt=[981, 1958, 5686, 4614, 2813, 2000, 1411, 1129, 844, 3564]
- batch_size=[58, 34, 24, 18, 16, 16, 16, 16, 16, 16]
-
-Now, we can create a dataloader for the training set using the bucketing sampler.
-
-.. code:: python
-
- >>> train_dataloader = gluon.data.DataLoader(dataset=train_dataset,
- >>> batch_sampler=batch_sampler,
- >>> batchify_fn=batchify_fn)
-
-In our sampler API, we also provide another sampler called :class:`~gluonnlp.data.sampler.SortedBucketSampler`,
-which results in the following padding pattern:
-
-.. image:: ./images/sorted_bucket_strategy.png
- :height: 200px
- :width: 1000 px
- :alt: alternate text
- :align: center
-
-With this strategy, we partition the data into a number of buckets of size ``batch_size * mult``, where ``mult`` is a multiplier
-that determines the bucket size, so each bucket contains ``batch_size * mult`` elements. The samples inside each bucket are sorted
-based on ``sort_key`` and then batched, as sketched below.
-
-.. code:: python
-
- >>> batch_sampler = nlp.data.sampler.SortedBucketSampler(train_data_lengths,
- >>> batch_size=16,
- >>> mult=100,
- >>> shuffle=True)
-
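-As a rough, pure-Python sketch of this idea (ignoring shuffling, and only for intuition rather than reproducing the library's exact implementation): take chunks of ``batch_size * mult`` samples, sort each chunk by length, and cut the sorted chunk into minibatches.
-
-.. code:: python
-
-    >>> toy_lengths = [5, 50, 8, 47, 9, 45, 7, 52]   # toy sequence lengths
-    >>> toy_batch_size, toy_mult = 2, 2
-    >>> bucket = toy_batch_size * toy_mult
-    >>> batches = []
-    >>> for start in range(0, len(toy_lengths), bucket):
-    >>>     chunk = sorted(range(start, min(start + bucket, len(toy_lengths))),
-    >>>                    key=lambda i: toy_lengths[i])
-    >>>     batches += [chunk[i:i + toy_batch_size]
-    >>>                 for i in range(0, len(chunk), toy_batch_size)]
-    >>> print(batches)   # each minibatch groups indices with similar lengths
-
-    [[0, 2], [3, 1], [6, 4], [5, 7]]
-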
-More details about training with a pre-trained language model and bucketing can be found in the
-`sentiment analysis tutorial `_.
diff --git a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png b/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png
deleted file mode 100644
index cae9de8c7a..0000000000
Binary files a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.0.png and /dev/null differ
diff --git a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png b/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png
deleted file mode 100644
index 685a80d7cf..0000000000
Binary files a/docs/examples/notes/images/fixed_bucket_strategy_ratio0.7.png and /dev/null differ
diff --git a/docs/examples/notes/images/no_bucket_strategy.png b/docs/examples/notes/images/no_bucket_strategy.png
deleted file mode 100644
index dd11d631c3..0000000000
Binary files a/docs/examples/notes/images/no_bucket_strategy.png and /dev/null differ
diff --git a/docs/examples/notes/images/sorted_bucket_strategy.png b/docs/examples/notes/images/sorted_bucket_strategy.png
deleted file mode 100644
index 227e48b8a7..0000000000
Binary files a/docs/examples/notes/images/sorted_bucket_strategy.png and /dev/null differ
diff --git a/docs/examples/notes/index.rst b/docs/examples/notes/index.rst
deleted file mode 100644
index f090dbd4e8..0000000000
--- a/docs/examples/notes/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-Data Loading and Vocabularies
-=============================
-
-Here are some notes on the basic usage of our API.
-
-
-.. container:: cards
-
- .. card::
- :title: Data Loading API
- :link: data_api.html
-
- See how to load and process the sentiment dataset to form batches that can be processed efficiently.
-
- .. card::
- :title: Vocabulary and Embedding API
- :link: vocab_emb.html
-
-      See how to write simple code to create indices for tokens.
-
-.. toctree::
- :hidden:
- :maxdepth: 1
-
- data_api
- vocab_emb
diff --git a/docs/examples/notes/vocab_emb.rst b/docs/examples/notes/vocab_emb.rst
deleted file mode 100644
index 04496a6e2d..0000000000
--- a/docs/examples/notes/vocab_emb.rst
+++ /dev/null
@@ -1,99 +0,0 @@
-Vocabulary and Embedding API
-----------------------------
-
-This note illustrates how to write simple code to create indices for tokens to form a
-:class:`vocabulary <gluonnlp.Vocab>`, and utilize pre-trained
-:mod:`word embeddings <gluonnlp.embedding>`.
-
-All the code demonstrated in this document assumes that the following
-modules or packages are imported.
-
-.. code:: python
-
- >>> from mxnet import gluon, nd
- >>> import gluonnlp as nlp
-
-
-Indexing words and using pre-trained word embeddings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-As a common use case, let us index words, attach pre-trained word
-embeddings for them, and use such embeddings in :mod:`mxnet.gluon` in just a few
-lines of code.
-
-To begin with, suppose that we have a simple text data set in the string
-format. We can count word frequency in the data set.
-
-.. code:: python
-
- >>> text_data = ['hello', 'world', 'hello', 'nice', 'world', 'hi', 'world']
- >>> counter = nlp.data.count_tokens(text_data)
-
-The obtained :class:`~gluonnlp.data.Counter` has key-value pairs whose keys are words and
-values are word frequencies. This allows us to filter out infrequent
-words. Suppose that we want to build indices for all the keys in :class:`~gluonnlp.data.Counter`.
-We need a :class:`~gluonnlp.Vocab` instance with :class:`~gluonnlp.data.Counter` as its argument.
-
-.. code:: python
-
- >>> my_vocab = nlp.Vocab(counter)
-
-To attach word embeddings to indexed words in ``my_vocab``, let us go on
-to create a :class:`fastText <gluonnlp.embedding.FastText>` word embedding instance by specifying the embedding
-name ``fasttext`` and the pre-trained file name ``wiki.simple``.
-
-.. code:: python
-
- >>> fasttext = nlp.embedding.create('fasttext', source='wiki.simple')
-
-This automatically downloads the corresponding embedding file from a public repository,
-and the file is stored by default in ``~/.mxnet/embedding/``.
-Next, we can attach word embedding ``fasttext`` to indexed words
-``my_vocab``.
-
-.. code:: python
-
- >>> my_vocab.set_embedding(fasttext)
-
-Now we are ready to access the :class:`fastText <gluonnlp.embedding.FastText>` word embedding vectors for
-indexed words, such as 'hello' and 'world'.
-
-.. code:: python
-
- >>> my_vocab.embedding[['hello', 'world']]
-
- [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01
- ...
- -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]
- [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01
- ...
- -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]
-
-
-To demonstrate how to use pre-trained word embeddings with :mod:`mxnet.gluon` models,
-let us first obtain indices of the words ‘hello’ and ‘world’.
-
-.. code:: python
-
- >>> my_vocab[['hello', 'world']]
- [5, 4]
-
-We can obtain the vector representation for the words ‘hello’ and
-‘world’ by specifying their indices (5 and 4) and the weight matrix
-``my_vocab.embedding.idx_to_vec`` in :class:`mxnet.gluon.nn.Embedding`.
-
-.. code:: python
-
- >>> input_dim, output_dim = my_vocab.embedding.idx_to_vec.shape
- >>> layer = gluon.nn.Embedding(input_dim, output_dim)
- >>> layer.initialize()
- >>> layer.weight.set_data(my_vocab.embedding.idx_to_vec)
- >>> layer(nd.array([5, 4]))
-
- [[ 3.95669997e-01 2.14540005e-01 -3.53889987e-02 -2.42990002e-01
- ...
- -7.54180014e-01 -3.14429998e-01 2.40180008e-02 -7.61009976e-02]
- [ 1.04440004e-01 -1.08580001e-01 2.72119999e-01 1.32990003e-01
- ...
- -3.73499990e-01 5.67310005e-02 5.60180008e-01 2.90190000e-02]]
-
diff --git a/docs/examples/sentence_embedding/bert b/docs/examples/sentence_embedding/bert
deleted file mode 120000
index 849189304e..0000000000
--- a/docs/examples/sentence_embedding/bert
+++ /dev/null
@@ -1 +0,0 @@
-../../../scripts/bert/
\ No newline at end of file
diff --git a/docs/examples/sentence_embedding/bert-embed.png b/docs/examples/sentence_embedding/bert-embed.png
deleted file mode 100644
index 1100e970cf..0000000000
Binary files a/docs/examples/sentence_embedding/bert-embed.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/bert-sentence-pair.png b/docs/examples/sentence_embedding/bert-sentence-pair.png
deleted file mode 100644
index 1dc37953f4..0000000000
Binary files a/docs/examples/sentence_embedding/bert-sentence-pair.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/bert.md b/docs/examples/sentence_embedding/bert.md
deleted file mode 100644
index f26b9b7e57..0000000000
--- a/docs/examples/sentence_embedding/bert.md
+++ /dev/null
@@ -1,421 +0,0 @@
-# Fine-tuning Pre-trained BERT Models
-
-Pre-trained language representations have been shown to improve many downstream NLP tasks such as
-question answering and natural language inference. To apply pre-trained
-representations to these tasks, there are two main strategies:
-
-1. The *feature-based* approach, which uses the pre-trained representations as additional
-features to the downstream task.
-2. The *fine-tuning*-based approach, which trains on the downstream tasks by
-fine-tuning the pre-trained parameters.
-
-While feature-based approaches such as ELMo [3] (introduced in the previous tutorial) are effective
-in improving many downstream tasks, they require task-specific architectures.
-Devlin et al. proposed BERT [1] (Bidirectional Encoder Representations
-from Transformers), which *fine-tunes* deep bi-directional representations on a
-wide range of tasks with minimal task-specific parameters, and obtains
-state-of-the-art results.
-
-In this tutorial, we will focus on fine-tuning with the
-pre-trained BERT model to classify semantically equivalent sentence pairs.
-
-Specifically, we will:
-
-1. Load the state-of-the-art pre-trained BERT model and attach an additional layer for classification
-2. Process and transform sentence-pair data for the task at hand
-3. Fine-tune the BERT model for sentence classification
-
-## Setup
-
-To use this tutorial, please download the required files from the above download link, and install
-GluonNLP.
-
-### Importing necessary modules
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import io
-import random
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.calibration import BertLayerCollector
-# this notebook assumes that all required scripts are already
-# downloaded from the corresponding tutorial webpage on http://gluon-nlp.mxnet.io
-from bert import data
-
-nlp.utils.check_version('0.8.1')
-```
-
-### Setting up the environment
-
-Please note the comment in the code if no GPU is available.
-
-```{.python .input}
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-# change `ctx` to `mx.cpu()` if no GPU is available.
-ctx = mx.gpu(0)
-```
-
-## Using the pre-trained BERT model
-
-The list of pre-trained BERT models available
-in GluonNLP can be found
-[here](../../model_zoo/bert/index.rst).
-
-In this tutorial, the BERT model we will use is BERT BASE, trained on an uncased corpus
-of books and the English Wikipedia dataset, from the GluonNLP model zoo.
-
-### Get BERT
-
-Let's first take a look at the BERT model architecture for sentence pair classification below:
-![bert-sentence-pair](bert-sentence-pair.png)
-where the model takes a pair of sequences and pools the representation of the
-first token in the sequence.
-Note that the original BERT model was trained on masked language modeling and
-next-sentence prediction tasks, and therefore includes layers for language model
-decoding and classification. These layers will not be used
-for fine-tuning the sentence pair classification task.
-
-We can load the
-pre-trained BERT fairly easily
-using the model API in GluonNLP, which returns the vocabulary
-along with the
-model. We include the pooler layer of the pre-trained model by setting
-`use_pooler` to `True`.
-
-```{.python .input}
-bert_base, vocabulary = nlp.model.get_model('bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased',
- pretrained=True, ctx=ctx, use_pooler=True,
- use_decoder=False, use_classifier=False)
-print(bert_base)
-```
-
-### Transform the model for `SentencePair` classification
-
-Now that we have loaded
-the BERT model, we only need to attach an additional layer for classification.
-The `BERTClassifier` class uses a BERT base model to encode sentence
-representation, followed by a `nn.Dense` layer for classification.
-
-```{.python .input}
-bert_classifier = nlp.model.BERTClassifier(bert_base, num_classes=2, dropout=0.1)
-# only need to initialize the classifier layer.
-bert_classifier.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-bert_classifier.hybridize(static_alloc=True)
-
-# softmax cross entropy loss for classification
-loss_function = mx.gluon.loss.SoftmaxCELoss()
-loss_function.hybridize(static_alloc=True)
-
-metric = mx.metric.Accuracy()
-```
-
-## Data preprocessing for BERT
-
-For this tutorial, we need to do a bit of preprocessing before feeding our data into
-the BERT model. Here we want to leverage the dataset included in the archive downloaded at the
-beginning of this tutorial.
-
-### Loading the dataset
-
-We use
-the dev set of the
-Microsoft Research Paraphrase Corpus dataset. The file is
-named 'dev.tsv'. Let's take a look at the first few lines of the raw dataset.
-
-```{.python .input}
-tsv_file = io.open('dev.tsv', encoding='utf-8')
-for i in range(5):
- print(tsv_file.readline())
-```
-
-The file contains 5 columns, separated by tabs.
-The header of
-the file explains each of these columns, and an explanation of each is also included
-here:
-0. The label indicating whether the two
-sentences are semantically equivalent
-1. The id of the first sentence in this
-sample
-2. The id of the second sentence in this sample
-3. The content of the
-first sentence
-4. The content of the second sentence
-
-For our task, we are
-interested in the 0th, 3rd and 4th columns.
-To load this dataset, we can use the
-`TSVDataset` API and skip the first line because it's just the schema:
-
-```{.python .input}
-# Skip the first line, which is the schema
-num_discard_samples = 1
-# Split fields by tabs
-field_separator = nlp.data.Splitter('\t')
-# Fields to select from the file
-field_indices = [3, 4, 0]
-data_train_raw = nlp.data.TSVDataset(filename='dev.tsv',
- field_separator=field_separator,
- num_discard_samples=num_discard_samples,
- field_indices=field_indices)
-sample_id = 0
-# Sentence A
-print(data_train_raw[sample_id][0])
-# Sentence B
-print(data_train_raw[sample_id][1])
-# 1 means equivalent, 0 means not equivalent
-print(data_train_raw[sample_id][2])
-```
-
-To use the pre-trained BERT model, we need to pre-process the data in the same
-way it was trained. The following figure shows the input representation in BERT:
-![bert-embed](bert-embed.png)
-
-We will use
-`BERTDatasetTransform` to perform the following transformations:
-- tokenize
-the
-input sequences
-- insert [CLS] at the beginning
-- insert [SEP] between sentence
-A and sentence B, and at the end
-- generate segment ids to indicate whether
-a token belongs to the first sequence or the second sequence.
-- generate valid length
-
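-For intuition, here is a toy, hand-rolled version of this layout (using made-up word-level tokens instead of the real WordPiece tokenization performed by `BERTDatasetTransform` below):
-
-```{.python .input}
-# Toy sketch only -- the real preprocessing is done by BERTDatasetTransform below.
-toy_a = ['he', 'said', 'hello']      # pretend-tokenized sentence A
-toy_b = ['she', 'waved', 'back']     # pretend-tokenized sentence B
-toy_tokens = ['[CLS]'] + toy_a + ['[SEP]'] + toy_b + ['[SEP]']
-toy_segments = [0] * (len(toy_a) + 2) + [1] * (len(toy_b) + 1)
-toy_valid_length = len(toy_tokens)
-print(toy_tokens)
-print(toy_segments)
-print(toy_valid_length)
-```
-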
-```{.python .input}
-# Use the vocabulary from pre-trained model for tokenization
-bert_tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=True)
-
-# The maximum length of an input sequence
-max_len = 128
-
-# The labels for the two classes [(0 = not similar) or (1 = similar)]
-all_labels = ["0", "1"]
-
-# whether to transform the data as sentence pairs.
-# for single sentence classification, set pair=False
-# for regression task, set class_labels=None
-# for inference without label available, set has_label=False
-pair = True
-transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_len,
- class_labels=all_labels,
- has_label=True,
- pad=True,
- pair=pair)
-data_train = data_train_raw.transform(transform)
-
-print('vocabulary used for tokenization = \n%s'%vocabulary)
-print('%s token id = %s'%(vocabulary.padding_token, vocabulary[vocabulary.padding_token]))
-print('%s token id = %s'%(vocabulary.cls_token, vocabulary[vocabulary.cls_token]))
-print('%s token id = %s'%(vocabulary.sep_token, vocabulary[vocabulary.sep_token]))
-print('token ids = \n%s'%data_train[sample_id][0])
-print('segment ids = \n%s'%data_train[sample_id][1])
-print('valid length = \n%s'%data_train[sample_id][2])
-print('label = \n%s'%data_train[sample_id][3])
-```
-
-## Fine-tuning the model
-
-Now we have all the pieces to put together, and we can finally start fine-tuning the
-model with very few epochs. For demonstration, we use a fixed learning rate and
-skip the validation steps. For the optimizer, we leverage the ADAM optimizer which
-performs very well for NLP data and for BERT models in particular.
-
-```{.python .input}
-# The hyperparameters
-batch_size = 32
-lr = 5e-6
-
-# The FixedBucketSampler and the DataLoader for making the mini-batches
-train_sampler = nlp.data.FixedBucketSampler(lengths=[int(item[2]) for item in data_train],
- batch_size=batch_size,
- shuffle=True)
-bert_dataloader = mx.gluon.data.DataLoader(data_train, batch_sampler=train_sampler)
-
-trainer = mx.gluon.Trainer(bert_classifier.collect_params(), 'adam',
- {'learning_rate': lr, 'epsilon': 1e-9})
-
-# Collect all differentiable parameters
-# `grad_req == 'null'` indicates no gradients are calculated (e.g. constant parameters)
-# The gradients for these params are clipped later
-params = [p for p in bert_classifier.collect_params().values() if p.grad_req != 'null']
-grad_clip = 1
-
-# Training the model with only three epochs
-log_interval = 4
-num_epochs = 3
-for epoch_id in range(num_epochs):
- metric.reset()
- step_loss = 0
- for batch_id, (token_ids, segment_ids, valid_length, label) in enumerate(bert_dataloader):
- with mx.autograd.record():
-
- # Load the data to the GPU
- token_ids = token_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx)
- segment_ids = segment_ids.as_in_context(ctx)
- label = label.as_in_context(ctx)
-
- # Forward computation
- out = bert_classifier(token_ids, segment_ids, valid_length.astype('float32'))
- ls = loss_function(out, label).mean()
-
- # And backwards computation
- ls.backward()
-
- # Gradient clipping
- trainer.allreduce_grads()
-            nlp.utils.clip_grad_global_norm(params, grad_clip)
- trainer.update(1)
-
- step_loss += ls.asscalar()
- metric.update([label], [out])
-
- # Printing vital information
- if (batch_id + 1) % (log_interval) == 0:
- print('[Epoch {} Batch {}/{}] loss={:.4f}, lr={:.7f}, acc={:.3f}'
- .format(epoch_id, batch_id + 1, len(bert_dataloader),
- step_loss / log_interval,
- trainer.learning_rate, metric.get()[1]))
- step_loss = 0
-```
-
-## Quantize the model
-
-GluonNLP also delivers INT8 quantization methods to improve performance and reduce deployment costs for natural language inference tasks. In production, lower precision (INT8) has two main benefits. First, the computation can be accelerated by low-precision instructions, like the Intel Vector Neural Network Instructions (VNNI). Second, a lower-precision data type saves memory bandwidth, allows for better cache locality, and saves power. This feature can deliver up to a 4X performance speedup on the latest [AWS EC2 C5 instances](https://aws.amazon.com/blogs/aws/now-available-new-c5-instance-sizes-and-bare-metal-instances/) with [Intel Deep Learning Boost (VNNI)](https://www.intel.ai/intel-deep-learning-boost/) enabled hardware, with less than a 0.5% accuracy drop.
-
-Now that we have a model fine-tuned on the MRPC training dataset, in this section we will quantize it into the INT8 data type using a subset of the MRPC validation dataset.
-
-```{.python .input}
-# The hyperparameters
-dev_batch_size = 32
-num_calib_batches = 5
-quantized_dtype = 'auto'
-calib_mode = 'customize'
-
-# sampler for evaluation
-pad_val = vocabulary[vocabulary.padding_token]
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val), # input
- nlp.data.batchify.Pad(axis=0, pad_val=0), # segment
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Stack('int32')) # label
-dev_dataloader = mx.gluon.data.DataLoader(data_train, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=batchify_fn)
-
-# Calibration function
-def calibration(net, dev_data, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- print('Now we are doing calibration on dev with cpu.')
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=None)
- num_calib_examples = dev_batch_size * num_calib_batches
- quantized_net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_data,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=mx.cpu(),
- LayerOutputCollector=collector,
- logger=None)
- print('Calibration done with success.')
- return quantized_net
-
-# This fallback can be removed once MXNet 1.7 is released.
-try:
- quantized_net = calibration(bert_classifier,
- dev_dataloader,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
-except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
-    warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
-```
-
-## Deployment
-
-After quantization, we can also export the quantized model for inference deployment.
-
-```{.python .input}
-prefix = './model_bert_squad_quantized'
-
-def deployment(net, prefix, dataloader):
- net.export(prefix, epoch=0)
- print('Saving quantized model at ', prefix)
- print('load symbol file directly as SymbolBlock for model deployment.')
- static_net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(prefix))
- static_net.hybridize(static_alloc=True, static_shape=True)
-    for batch_id, (token_ids, segment_ids, valid_length, label) in enumerate(dataloader):
- token_ids = token_ids.as_in_context(mx.cpu())
- valid_length = valid_length.as_in_context(mx.cpu())
- segment_ids = segment_ids.as_in_context(mx.cpu())
- label = label.as_in_context(mx.cpu())
- out = static_net(token_ids, segment_ids, valid_length.astype('float32'))
- metric.update([label], [out])
-
- # Printing vital information
- if (batch_id + 1) % (log_interval) == 0:
- print('[Batch {}/{}], acc={:.3f}'
-                  .format(batch_id + 1, len(dataloader),
- metric.get()[1]))
- return metric
-
-# This fallback can be removed once MXNet 1.7 is released.
-try:
- eval_metric = deployment(quantized_net, prefix, dev_dataloader)
-except NameError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
-    warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
-```
-
-## Conclusion
-
-In this tutorial, we showed how to fine-tune a sentence pair
-classification model with pre-trained BERT parameters. In GluonNLP, this can be
-done in just a few simple steps. All we did was apply a BERT-style data transformation to
-pre-process the data, automatically download the pre-trained model, and feed the
-transformed data into the model, all within 50 lines of code!
-
-For demonstration purposes, we skipped the warmup learning rate
-schedule and validation on the dev dataset used in the original
-implementation. Please visit the
-[BERT model zoo webpage](../../model_zoo/bert/index.rst), or the scripts/bert folder
-in the Github repository for the complete fine-tuning scripts.
-
-## References
-
-[1] Devlin, Jacob, et al. "BERT: Pre-training of deep bidirectional transformers
-for language understanding." arXiv preprint arXiv:1810.04805 (2018).
-
-[2] Dolan, William B., and Chris Brockett. "Automatically constructing a corpus of
-sentential paraphrases." Proceedings of the Third International Workshop on
-Paraphrasing (IWP2005). 2005.
-
-[3] Peters, Matthew E., et al. "Deep contextualized word representations."
-arXiv preprint arXiv:1802.05365 (2018).
diff --git a/docs/examples/sentence_embedding/bert.png b/docs/examples/sentence_embedding/bert.png
deleted file mode 100644
index 74243dae62..0000000000
Binary files a/docs/examples/sentence_embedding/bert.png and /dev/null differ
diff --git a/docs/examples/sentence_embedding/dev.tsv b/docs/examples/sentence_embedding/dev.tsv
deleted file mode 100644
index 435bde0d09..0000000000
--- a/docs/examples/sentence_embedding/dev.tsv
+++ /dev/null
@@ -1,409 +0,0 @@
-Quality #1 ID #2 ID #1 String #2 String
-1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
-0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
-0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
-1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
-0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
-1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
-0 1490811 1490840 While dioxin levels in the environment were up last year , they have dropped by 75 percent since the 1970s , said Caswell . The Institute said dioxin levels in the environment have fallen by as much as 76 percent since the 1970s .
-1 426112 426210 This integrates with Rational PurifyPlus and allows developers to work in supported versions of Java , Visual C # and Visual Basic .NET. IBM said the Rational products were also integrated with Rational PurifyPlus , which allows developers to work in Java , Visual C # and VisualBasic .Net.
-1 1439663 1439808 The top rate will go to 4.45 percent for all residents with taxable incomes above $ 500,000 . For residents with incomes above $ 500,000 , the income-tax rate will increase to 4.45 percent .
-1 3147370 3147525 The results appear in the January issue of Cancer , an American Cancer Society journal , being published online today . The results appear in the January issue of Cancer , an American Cancer Society ( news - web sites ) journal , being published online Monday .
-1 3300040 3299992 The delegates said raising and distributing funds has been complicated by the U.S. crackdown on jihadi charitable foundations , bank accounts of terror-related organizations and money transfers . Bin Laden ’ s men pointed out that raising and distributing funds has been complicated by the U.S. crackdown on jihadi charitable foundations , bank accounts of terror-related organizations and money transfers .
-0 524136 524119 " Sanitation is poor ... there could be typhoid and cholera , " he said . " Sanitation is poor , drinking water is generally left behind . . . there could be typhoid and cholera . "
-0 969512 969295 The broader Standard & Poor 's 500 Index .SPX gave up 11.91 points , or 1.19 percent , at 986.60 . The technology-laced Nasdaq Composite Index was down 25.36 points , or 1.53 percent , at 1,628.26 .
-1 1685339 1685429 The only announced Republican to replace Davis is Rep. Darrell Issa of Vista , who has spent $ 1.71 million of his own money to force a recall . So far the only declared major party candidate is Rep. Darrell Issa , a Republican who has spent $ 1.5 million of his own money to fund the recall .
-1 1967578 1967664 The decision to issue new guidance has been prompted by intelligence passed to Britain by the FBI in a secret briefing in late July . Scotland Yard 's decision to issue new guidance has been prompted by new intelligence passed to Britain by the FBI in late July .
-1 2047034 2046820 Unable to find a home for him , a judge told mental health authorities they needed to find supervised housing and treatment for DeVries somewhere in California . The judge had told the state Department of Mental Health to find supervised housing and treatment for DeVries somewhere in California .
-1 2046630 2046644 The decision came a year after Whipple ended federal oversight of the district 's racial balance , facilities , budget , and busing . The decision came a year after Whipple ended federal oversight of school busing as well as the district 's racial balance , facilities and budget .
-0 2221603 2221633 In midafternoon trading , the Nasdaq composite index was up 8.34 , or 0.5 percent , to 1,790.47 . The Nasdaq Composite Index .IXIC dipped 8.59 points , or 0.48 percent , to 1,773.54 .
-1 129995 129864 Morgan Stanley raised its rating on the beverage maker to " overweight " from " equal-weight " saying in part that pricing power with its bottlers should improve in 2004 . Morgan Stanley raised its rating on the company to " overweight " from " equal-weight , " saying the beverage maker 's pricing power with bottlers should improve in 2004 .
-0 919683 919782 The pound also made progress against the dollar , reached fresh three-year highs at $ 1.6789 . The British pound flexed its muscle against the dollar , last up 1 percent at $ 1.6672 .
-0 970740 971209 Friday , Stanford ( 47-15 ) blanked the Gamecocks 8-0 . Stanford ( 46-15 ) has a team full of such players this season .
-1 2745055 2745022 Last month Intel raised its revenue guidance for the quarter to between $ 7.6 billion and $ 7.8 billion . At the end of the second quarter , Intel initially predicted sales of between $ 6.9 billion and $ 7.5 billion .
-0 2199097 2199072 The driver , Eugene Rogers , helped to remove children from the bus , Wood said . At the accident scene , the driver was " covered in blood " but helped to remove children , Wood said .
-1 1609290 1609098 ONG KONG , July 9 Tens of thousands of demonstrators gathered tonight before the legislature building here to call for free elections and the resignation of Hong Kong 's leader . Tens of thousands of demonstrators gathered yesterday evening to stand before this city 's legislature building and call for free elections and the resignation of Hong Kong 's leader .
-1 1597193 1597119 Saddam loyalists have been blamed for sabotaging the nation 's infrastructure , as well as frequent attacks on U.S. soldiers . Hussein loyalists have been blamed for sabotaging the nation 's infrastructure and attacking US soldiers .
-1 2758944 2758975 Its closest living relatives are a family frogs called sooglossidae that are found only in the Seychelles in the Indian Ocean . Its closest relative is found in the Seychelles Archipelago , near Madagascar in the Indian Ocean .
-0 2584416 2584653 Cooley said he expects Muhammad will similarly be called as a witness at a pretrial hearing for Malvo . Lee Boyd Malvo will be called as a witness Wednesday in a pretrial hearing for fellow sniper suspect John Allen Muhammad .
-1 86007 86373 " Instead of pursuing the most imminent and real threats - international terrorists , " Graham said , " this Bush administration chose to settle old scores . " " Instead of pursuing the most imminent and real threats - international terrorists - this Bush administration has chosen to settle old scores , " Graham said .
-1 1602860 1602844 He said they lied on a sworn affidavit that requires them to list prior marriages . Morgenthau said the women , all U.S. citizens , lied on a sworn affidavit that requires them to list prior marriages .
-1 1201306 1201329 The association said 28.2 million DVDs were rented in the week that ended June 15 , compared with 27.3 million VHS cassettes . The Video Software Dealers Association said 28.2 million DVDs were rented out last week , compared to 27.3 million VHS cassettes .
-0 461779 461815 With these assets , Funny Cide has a solid chance to become the first Triple Crown winner since Affirmed in 1978 . Funny Cide is looking to become horse racing 's first Triple Crown winner in a generation .
-1 1438666 1438643 Intel was disappointed and assessing its " options in the event Mr. Hamidi resumes his spamming activity against Intel , " spokesman Chuck Mulloy said . Intel spokesman Chuck Mulloy said the company was disappointed and assessing its " options in the event Mr. Hamidi resumes his spamming activity against Intel . "
-1 3261484 3261306 Mr Annan also warned the US should not use the war on terror as an excuse to suppress " long-cherished freedoms " . Annan warned that the dangers of extremism after September 11 should not be used as an excuse to suppress " long-cherished " freedoms .
-1 1277539 1277527 At community colleges , tuition will jump to $ 2,800 from $ 2,500 . Community college students will see their tuition rise by $ 300 to $ 2,800 or 12 percent .
-1 3035788 3035918 He made a point of saying during Tuesdays debate that the Confederate flag was a racist symbol . Though Dean made a point of saying during the debate that the Confederate flag is a racist symbol .
-0 132553 132725 Bush wanted " to see an aircraft landing the same way that the pilots saw an aircraft landing , " White House press secretary Ari Fleischer said yesterday . On Tuesday , before Byrd 's speech , Fleischer said Bush wanted ' ' to see an aircraft landing the same way that the pilots saw an aircraft landing .
-0 2259788 2259747 On Monday the Palestinian Prime Minister , Mahmoud Abbas , will report to the Palestinian parliament on his Government 's achievements in its first 100 days in office . Palestinian Prime Minister Mahmoud Abbas must defend the record of his first 100 days in office before Parliament today as the death toll in the occupied territories continues to rise .
-0 2307064 2307235 The civilian unemployment rate improved marginally last month -- slipping to 6.1 percent -- even as companies slashed payrolls by 93,000 . The civilian unemployment rate improved marginally last month _ sliding down to 6.1 percent _ as companies slashed payrolls by 93,000 amid continuing mixed signals about the nation 's economic health .
-1 3046488 3046824 Per-user pricing is $ 29 for Workplace Messaging , $ 89 for Team Collaboration and $ 35 for Collaborative Learning . Workplace Messaging is $ 29 , Workplace Team Collaboration is $ 89 , and Collaborative Learning is $ 35 .
-1 86020 86007 " Instead of pursuing the most imminent and real threats – international terrorism – this Bush administration chose to settle old scores , " Mr. Graham said . " Instead of pursuing the most imminent and real threats - international terrorists , " Graham said , " this Bush administration chose to settle old scores . "
-0 1100998 1100441 SARS has killed about 800 people and affected more than 8400 since being detected in China in November . SARS has killed about 800 people and sickened more than 8,400 worldwide , mostly in Asia .
-1 2268396 2268480 Authorities had no evidence to suggest the two incidents were connected . There was no immediate evidence that the two incidents were connected , police said .
-0 1984039 1983986 " Jeremy 's a good guy , " Barber said , adding : " Jeremy is living the dream life of the New York athlete . He also said Shockey is " living the dream life of a New York athlete .
-0 2697659 2697747 Ratliff 's daughters , Margaret and Martha Ratliff , were adopted by Peterson after their mother 's death . Peterson helped raise Ratliff 's two daughters , Margaret and Martha Ratliff , who supported him throughout the trial .
-0 2175939 2176090 After losing as much as 84.56 earlier , the Dow Jones industrial average closed up 22.81 , or 0.2 percent , at 9,340.45 . In midday trading , the Dow Jones industrial average lost 68.84 , or 0.7 percent , to 9,248.80 .
-1 886618 886456 Rumsfeld , who has been feuding for two years with Army leadership , passed over nine active-duty four-star generals . Rumsfeld has been feuding for a long time with Army leadership , and he passed over nine active-duty four-star generals .
-1 588637 588864 Consumers who said jobs are difficult to find jumped from 29.4 to 32.6 , while those claiming work was plentiful slipped from 13 to 12.6 . Consumers who said jobs are difficult to find jumped to 32.6 from 29.4 , while those saying work was plentiful slipped to 12.6 from 13 in April .
-0 2252795 2252970 He has no immediate plans for television advertising , believing it is unnecessary this early . A Lieberman aide said there were no immediate plans for television advertising .
-1 1756329 1756394 " I think it happened very quickly , " Houston Police Department homicide investigator Phil Yochum said of the crime . " I think it happened very quickly , " said Investigator Phil Yochum of the Houston Police Department 's homicide division .
-1 1673112 1673068 United issued a statement saying it will " work professionally and cooperatively with all its unions . " Senior vice president Sara Fields said the airline " will work professionally and cooperatively with all our unions . "
-1 2357324 2357271 " But they never climb out of the pot of beer again . " It 's just that they never climb out of the beer again . "
-1 780408 780363 Chief financial officer Andy Bryant has said that hike had a greater affect volume than officials expected . Bryant has said that hike had a greater effect on demand than officials expected .
-1 821523 821385 Robert Liscouski , the Assistant Secretary of Homeland Security for Infrastructure Protection , will oversee NCSD . NCSD 's chief will be Robert Liscouski , the assistant secretary of Homeland Security for Infrastructure Protection .
-1 2304696 2304863 HP 's shipments increased 48 percent year-over-year , compared to an increase of 31 percent for Dell . HPs shipments increased 48 per cent year-on-year , compared to an increase of 31 per cent for Dell .
-1 2531749 2531607 Chirac , who can pardon a law-breaker , refused Humbert 's request last year but kept in close touch with the family . Chirac , who has the authority to pardon law-breakers , refused Humbert 's request to be allowed to die last year but kept in close touch with the family .
-1 3180014 3179967 The charges allege that he was part of the conspiracy to kill and kidnap persons in a foreign country . The government now charges that Sattar conspired with Rahman to kill and kidnap individuals in foreign countries .
-1 726966 726945 In the 2002 study , the margin of error ranged from 1.8 to 4.4 percentage points . It has a margin of error of plus or minus three to four percentage points .
-1 2638861 2638982 Mr. Clinton 's national security adviser , Sandy Berger , said that the White House wasn 't informed of the FBI activities . Clinton ’ s national security adviser , Sandy Berger , said in an interview that the White House was not informed of the FBI activities .
-1 2495223 2495307 " This decision is clearly incorrect , " FTC Chairman Timothy Muris said in a written statement . The decision is " clearly incorrect , " FTC Chairman Tim Muris said .
-1 55187 54831 Prosecutors allege that Nichols and co-conspirator Timothy McVeigh worked together to prepare a bomb that destroyed the Alfred P. Murrah Federal Building . Prosecutors allege that Nichols and coconspirator Timothy McVeigh worked together to prepare a 4,000-pound fuel-and-fertilizer bomb that destroyed the Murrah building .
-0 2763381 2763517 Terri Schiavo , 39 , is expected to die sometime in the next two weeks in the Tampa-area hospice where she has spent the past several years . Terri Schiavo , 39 , underwent the procedure at the Tampa Bay area hospice where she has been living for several years , said her father , Bob Schindler .
-1 1990975 1991132 Secretary of State Colin Powell designated the Chechen leader believed responsible for last year 's hostage standoff in a Moscow theater as a threat to U.S. security Friday . U.S. Secretary of State Colin Powell on Friday designated Chechen rebel leader Shamil Basayev a threat to the security of the United States and to U.S. citizens .
-1 2204353 2204418 " Today , we are trying to convey this problem to Russian President Vladimir Putin and US President George W Bush . " " Today , we are trying to convey this problem to Russian President Vladimir Putin ( news - web sites ) and President Bush ( news - web sites ) . "
-1 60122 60445 That would be a potential setback to Chief Executive Phil Condit 's strategy of bolstering defense-related sales during a slump in jetliner deliveries . The inquiry may hinder Chief Executive Phil Condit 's strategy of bolstering defense-related sales during a slump in jetliner deliveries .
-1 961836 962243 PeopleSoft also said its board had officially rejected Oracle 's offer . Thursday morning , PeopleSoft 's board rejected the Oracle takeover offer .
-0 3140260 3140288 The Dow Jones industrial average ended the day down 10.89 at 9,837.94 , after advancing 111.04 Wednesday . The Dow Jones industrial average fell 10.89 points , or 0.11 percent , to 9,837.94 .
-1 1720166 1720115 Cortisol levels in the saliva of day care children were highest and rose most steeply in those judged by day care center personnel to be the shyest . Cortisol levels in the saliva of day-care children were highest and rose most steeply in those whom day-care centre staffed judged to be the shyest .
-1 2573262 2573319 " The idea that Tony Abbott is in some way a one-dimensional political head-kicker couldn 't be more wrong , " Mr Howard said . " The idea that Tony Abbott is in some way a one-dimensional political head kicker couldn 't be more wrong . "
-0 1353356 1353174 " Biotech products , if anything , may be safer than conventional products because of all the testing , " Fraley said , adding that 18 countries have adopted biotechnology . " Biotech products , if anything , may be safer than conventional products because of all the testing , " said Robert Fraley , Monsanto 's executive vice president .
-1 2738677 2738741 The rate of skin cancer has tripled since the 1950s in Norway and Sweden , according to the study . The study also found that skin cancer nearly tripled in Norway and Sweden since the 1950s .
-1 1638813 1639087 We acted because we saw the existing evidence in a new light , through the prism of our experience on 11 September , " Rumsfeld said . Rather , the US acted because the administration saw " existing evidence in a new light , through the prism of our experience on September 11 " .
-1 1605350 1605425 Trans fat makes up only 1 percent to 3 percent of the total fat Americans consume , compared with 14 percent for saturated fat . Trans fat accounts for 2.5 percent of Americans ' daily calories , compared to 11 percent to 12 percent for saturated fat .
-1 2494149 2494073 However , a recent slide in prices and OPEC 's expectations of a surge in oil inventories have compounded its fears about a further softening of the market . A 14 percent slide in crude prices this month and expectations of a build up in oil inventories compounded OPEC 's fears of a further softening of the market .
-1 3023029 3023229 Peterson , 31 , is now charged with murder in the deaths of his 27-year-old wife and their unborn son . Peterson , 31 , is charged with two counts of first-degree murder in the slayings of his wife , Laci , and their unborn son , Conner .
-1 1351550 1351155 Carlson on Tuesday said he would not recuse himself from the case . Service officials said Carlson refused to recuse himself from the case .
-1 981185 981234 The program will grow to include ports in Dubai , Turkey and Malaysia , among others . The program will be expanded to include areas of the Middle East such as Dubai , Turkey and Malaysia , Mr. Ridge said .
-0 2111629 2111786 McCabe said he was considered a witness , not a suspect . " He is not considered a suspect , " McCabe said .
-1 655498 655391 The woman was exposed to the SARS virus while in the hospital but was not a health care worker , said Dr. Colin D ’ Cunha , Ontario ’ s commissioner of public health . The woman was exposed to the SARS virus while in the hospital but was not a health-care worker , said Dr Colin D 'Cunha , Ontario 's commissioner of public health .
-1 533823 533909 He added that those " are not solely American principles , nor are they exclusively Western . " " These are not solely American principles nor are they exclusively Western , " Rumsfeld said .
-1 581592 581570 " If we don 't march into Tehran , I think we will be in pretty good shape , " he said . " As long as we don 't march on Tehran , I think we are going to be in pretty good shape , " he said .
-0 1010655 1010430 On Saturday , a 149mph serve against Agassi equalled Rusedski 's world record . On Saturday , Roddick equalled the world record with a 149 m.p.h. serve in beating Andre Agassi .
-1 2241925 2242066 Chad Kolton , emergency management spokesman with the Department of Homeland Security , said the government is open to new technologies and methods to communicate more quickly and efficiently . Chad Kolton , emergency management spokesman with the Department of Homeland Security , said the government is open to new ways to communicate .
-1 2796978 2797024 " APEC leaders are painfully aware that security and prosperity are inseparable , " Thai Prime Minister Thaksin Shinawatra told business leaders . " APEC leaders are painfully aware that security and prosperity are inseparable , " Thaksin said .
-0 101746 101775 Danbury prosecutor Warren Murray could not be reached for comment Monday . Prosecutors could not be reached for comment after the legal papers were obtained late Monday afternoon .
-1 327839 327748 Wittig resigned last year after being indicted on federal bank fraud charges involving a real estate loan unrelated to Westar business . Wittig resigned in late November about two weeks after being indicted on bank fraud charges in a real estate case unrelated to the company .
-0 2988297 2988555 Shattered Glass , " starring Hayden Christensen as Stephen Glass , debuted well with $ 80,000 in eight theaters . " Shattered Glass " _ starring Hayden Christensen as Stephen Glass , The New Republic journalist fired for fabricating stories _ debuted well with $ 80,000 in eight theaters .
-1 2217613 2217659 He was arrested Friday night at an Alpharetta seafood restaurant while dining with his wife , singer Whitney Houston . He was arrested again Friday night at an Alpharetta restaurant where he was having dinner with his wife .
-0 2128530 2128455 However , EPA officials would not confirm the 20 percent figure . Only in the past few weeks have officials settled on the 20 percent figure .
-1 2208376 2208198 University of Michigan President Mary Sue Coleman said in a statement on the university 's Web site , " Our fundamental values haven 't changed . " Our fundamental values haven 't changed , " Mary Sue Coleman , president of the university , said in a statement in Ann Arbor .
-1 1980654 1980641 The first products are likely to be dongles costing between US $ 100 and US $ 150 that will establish connections between consumer electronics devices and PCs . The first products will likely be dongles costing $ 100 to $ 150 that will establish connections between consumer electronics devices and PCs .
-0 589579 589557 However , Lapidus expects foreign brands ' sales to be up 4 percent , driven by strong truck sales at Honda Motor Co . Lapidus expects Ford to be down 5 percent , Chrysler down 10 percent and foreign brands up 4 percent driven by strong truck sales at Honda .
-1 1636060 1635946 Michel , who remains in the government , denied that US pressure had provoked the government 's move . Michel , who has stayed in the new government , denied that it was U.S. pressure which had provoked the government 's move .
-1 1630585 1630657 Some of the computers also are used to send spam e-mail messages to drum up traffic to the sites . Some are also used to send spam e-mail messages to boost traffic to the sites .
-0 447728 447699 Indonesia 's army has often been accused of human rights abuses during GAM 's battle for independence , charges it has generally denied while accusing the separatists of committing rights violations . Indonesia 's army has been accused of human rights abuses during its earlier battles with GAM , charges it has generally denied .
-1 1606495 1606619 Bush also hoped to polish his anti-AIDS credentials in Uganda , which has been hailed as an African pioneer in fighting the killer disease . President Bush flies to Uganda Friday hoping to polish his anti- AIDS credentials in a country hailed as an African pioneer in fighting the epidemic .
-1 1550897 1550977 Later this year , the command will send trainers with soldiers from four North African nations on patrolling and intelligence gathering missions . This fall the command will send trainers to work with soldiers from four North African nations on patrolling and gathering intelligence .
-0 490376 490490 The reports helped overcome investor jitters after the euro briefly hit an all-time high against the dollar Tuesday . Stocks slipped at the open after the euro hit record highs against the dollar .
-1 3084554 3084612 Sales for the quarter beat expectations , rising 37 percent year-on-year to 1.76 billion euros . Sales rose 37 per cent year-on-year to 1.76bn , beating expectations .
-1 315647 315778 If the MTA 's appeal to a higher court is successful , the $ 2 bus and subway base fare won 't be rolled back . If the MTA 's appeal is successful , the $ 2 bus and subway base fare won 't change .
-1 3428298 3428362 Robert Walsh , 40 , remained in critical but stable condition Friday at Staten Island University Hospital 's north campus . Walsh , also 40 , was in critical but stable condition at Staten Island University Hospital last night .
-1 2523564 2523358 The Guru microcontroller serves four functions : hardware monitoring , overclocking management , BIOS ( Basic Input Output System ) update and a troubleshooting-assistance feature called Black Box . The µGuru microcontroller serves four functions : hardware monitoring , overclocking management , BIOS update and a troubleshooting-assistance feature called Black Box .
-1 2079200 2079131 U.S. corporate bond yield spreads tightened in spotty trading on Friday as Wall Street labored to get back on its feet after the largest power outage ever in North America . U.S. stocks rose slightly on feather-light volume on Friday , as Wall Street regrouped after the biggest-ever power outage in North America .
-1 818091 817811 The company said it would issue revised guidance for the full fiscal year next month when it releases its Q2 results . The company said it would renew its guidance for 2003 when it announces its second quarter results in mid-July .
-1 1580638 1580663 " I stand 100 percent by it , and I think our intelligence services gave us the correct information at the time . " I stand 100 percent by it , and I think that our intelligence services gave us the correct intelligence and information at the time , " Blair said .
-0 1919740 1919926 " I don 't know if the person I 'm talking to now may end up being someone else at another time that may not follow the rules , " Parrish said . " I don 't know whether the person I 'm talking to now may end up being someone else , " Parrish said .
-1 2748287 2748550 " I think it 's going to be a close vote , but I think the grant proposal is going to win , " McConnell said . " I think it 's going to be a close vote , but I think the grant proposal 's going to win , " said Sen. Mitch McConnell , assistant majority leader .
-1 3394891 3394775 Twenty-eight people were believed to have been spending Christmas Day with the caretaker of the St Sophia 's camp , when the mudslide smashed into two cabins . Twenty-seven people were believed to have been spending Christmas Day with the caretaker of Saint Sophia Camp , a Greek Orthodox facility , when the mudslide roared through .
-0 2963943 2963880 One , Capt. Doug McDonald , remained hospitalized in critical condition on Thursday . Her 20-year-old sister , Allyson , was severely burned and remained hospitalized in critical condition .
-0 1865364 1865251 The United States finally relented during President Bush 's visit to Africa earlier this month . During President Bush 's trip to Africa earlier this month , however , Washington said it would support the increase .
-1 263690 263819 " There is no conscious policy of the United States , I can assure you of this , to move the dollar at all , " he said . He also said there is no conscious policy by the United States to move the value of the dollar .
-1 283751 283290 It 's the first such drill since the September 11 terrorist attacks on New York and Washington . It is the nation 's first large-scale counterterrorism exercise since the Sept . 11 terrorist attacks .
-1 2517014 2516995 Myanmar 's pro-democracy leader Aung San Suu Kyi will return home late Friday but will remain in detention after recovering from surgery at a Yangon hospital , her personal physician said . Myanmar 's pro-democracy leader Aung San Suu Kyi will be kept under house arrest following her release from a hospital where she underwent surgery , her personal physician said Friday .
-1 1330643 1330622 According to the Merchant Marine Ministry , the 37-year-old ship is registered to Alpha Shipping Inc. based in the Pacific Ocean nation of Marshall Islands . The Baltic Sky is a 37-year-old ship registered to Alpha Shipping Inc. based in the Pacific Ocean nation of Marshall Islands .
-1 3111452 3111428 In an unusual move , the U.S. Patent and Trademark Office is reconsidering a patent affecting Internet pages that critics contend could disrupt millions of Web sites . In an unusual move that critics contend could disrupt millions of Web sites , the U.S. Patent and Trademark Office is reconsidering a patent affecting Internet pages .
-0 1167835 1167651 Kansas Department of Health and Environment records show there were 88 abortions performed on girls age 14 and younger last year . Statistics from the Kansas Department of Health and Environment show that 11,844 abortions were performed in the state last year .
-0 1423836 1423708 A European Union spokesman said the Commission was consulting EU member states " with a view to taking appropriate action if necessary " on the matter . Laos 's second most important export destination - said it was consulting EU member states ' ' with a view to taking appropriate action if necessary ' ' on the matter .
-1 2090911 2091154 Waiting crowds filling the streets on both sides overwhelmed the peacekeepers soon after daylight , sweeping past the barbed wire barricades . But waiting crowds filling the streets rushed the bridges soon after daylight , overrunning razor-wire barricades .
-1 2265271 2265152 Barry Callebaut will be able to use Brach 's retail network to sell products made from its German subsidiary Stollwerck , which makes chocolate products not sold in the United States . Barry Callebaut will be able to use Brach 's retail network to sell products made from its German subsidiary Stollwerck , which makes chocolate products unknown to the American market .
-1 3062202 3062308 By skirting the FDA 's oversight , Eagan said , the quality of the imported drugs is " less predictable " than for those obtained in the United States . By skirting the FDA 's oversight , Eagan said the quality of the imported drugs is " less predictable " than U.S. drugs .
-1 2155514 2155377 He said : " For the first time there is an easy and affordable way of making this treasure trove of BBC content available to all . " " For the first time , there is an easy and affordable way of making this treasure trove of BBC content available to all , " Dyke said .
-1 1552068 1551928 Three such vigilante-style attacks forced the hacker organizer , who identified himself only as " Eleonora [ 67 ] , " to extend the contest until 7 p.m. EST Sunday . Three such vigilante-style attacks forced the hacker organiser , who identified himself only as " Eleonora67 ] , " to extend the contest until 8am ( AEST ) today .
-1 936978 937500 Eric Gagne pitched a perfect ninth for his 23rd save in as many opportunities . Gagne struck out two in a perfect ninth inning for his 23rd save .
-0 985015 984975 One way or another , Harry Potter And The Order Of The Phoenix will be in your hands by Saturday . Just about everything about " Harry Potter and the Order of the Phoenix " will set records .
-1 1430357 1430425 " Allison just proves you don 't need to wait until August or September to have a disaster , " said Josh Lichter , a meteorologist with the Houston-Galveston weather office . " Allison just proves you don 't need to wait until August or September to have a disaster , " Lichter said .
-1 3039310 3039413 Today , analysts say , UN members can no longer ignore the shifts since the September 11 2001 attacks . On Wednesday , analysts say , UN members can no longer ignore the shifts since the attacks in the US of September 11 2001 .
-1 34513 34742 Police say CIBA was involved in the importation of qat , a narcotic substance legal in Britain but banned in the United States . Mr McKinlay said that CIBA was involved in the importation of qat , a narcotic substance legal in Britain but banned in the US .
-1 368067 368018 Chiron already has nearly 20 percent acceptances from PowderJect 's shareholders . Chiron has acceptances from holders of nearly 20 percent of PowderJect shares .
-0 611663 611716 Ernst & Young has denied any wrongdoing and plans to fight the allegations . Ernst & Young has denied the SEC 's claims , and called its recommendations " irresponsible " .
-1 98432 98657 The attack followed several days of disturbances in the city where American soldiers exchanged fire with an unknown number of attackers as civilians carried out demonstrations against the American presence . The attack came after several days of disturbance in the city in which U.S. soldiers exchanged fire with an unknown number of attackers as civilians protested the American presence .
-1 3039007 3038845 No company employee has received an individual target letter at this time . She said no company official had received " an individual target letter at this time . "
-1 1708040 1708062 Second-quarter results reflected a gain of 10 cents per diluted share , while the 2002 results included a loss of 19 cents per diluted share . The second-quarter results had a non-operating gain of 10 cents a share while the 2002 second-quarter performance had a net non-operating loss of 19 cents a share .
-0 1757264 1757375 He allegedly told his ex-wife in an angry phone call that he had no intention of following their new custody agreement . The two had battled over custody and he allegedly told her in an angry phone call that he had no intention of following their new custody agreement .
-1 383417 383558 Worldwide , more than 50 million people have seen " Les Miz , " with gross receipts of $ 1.8 billion . Worldwide , Les Misérables has been seen by over 50 million people , with a total gross of over $ 2 billion .
-0 2766112 2766084 In fiction : Edward P. Jones ( " The Known World " ) and Scott Spencer ( " A Ship Made of Paper " ) . The fifth nominee for fiction is Scott Spencer , for A Ship Made of Paper .
-1 1261116 1261234 " Overwhelmingly the Windows brand really resonated with them . " " Windows was the part of the experience that really resonated with people . "
-1 3028143 3028234 The Centers for Medicare and Medicaid Services , the federal agency that runs Medicare , last year began a similar effort for nursing homes . The Centers for Medicare and Medicaid launched a similar consumer tool for nursing homes last year .
-0 249699 249623 Vivace was founded in 1999 and has raised over $ 118 million in three rounds of venture financing . During difficult times for technology venture capital , Vivace raised over $ 118 million in three rounds of venture financing .
-0 3448488 3448449 The Dow Jones industrial average < .DJI > added 28 points , or 0.27 percent , at 10,557 , hitting its highest level in 21 months . The Dow Jones industrial average < .DJI > rose 49 points , or 0.47 percent , to 10,578 .
-1 2749322 2749663 The Democratic candidates also began announcing their fund-raising totals before Wednesday 's deadline to file quarterly reports with the Federal Election Commission . The Democratic candidates also began announcing their fund-raising totals in advance of the deadline today to file quarterly reports with the Federal Election Commission .
-0 2204592 2204588 Sun Microsystems Inc. on Thursday said it had added 100 new third-party systems and 100 new components to its Hardware Compatibility List for the Solaris x86 operating system Platform Edition . The vendor has added 100 new third-party systems and 100 new components to the operating system 's Hardware Compatibility List ( HCL ) .
-1 2889005 2888954 Prosecutors said PW Marketing violated the state 's 1998 anti-spam law by sending unsolicited e-mail without a toll-free number for recipients to call to stop additional mailings . Prosecutors said PW Marketing violated the 1998 anti-spam law because these unsolicited e-mails were sent without a free call number for recipients to phone to stop additional mailings .
-0 1657632 1657619 The Neighbours star and singer spent yesterday resting at her family home in Sydney and will have more tests today . Goodrem spent yesterday resting in her family home in Sydney and will have more tests today to determine her exact treatment .
-0 555617 555528 The 3 rd Armored Cavalry Regiment is 5,200 strong and the largest combat unit at Fort Carson . Broomhead , 34 , was assigned to the 2nd Squadron , 3rd Armored Cavalry Regiment .
-1 2396937 2396818 " The risk of inflation becoming undesirably low remains the predominant concern for the foreseeable future , " the Fed said in a statement accompanying the unanimous decision . " The risk of inflation becoming undesirably low remains the predominant concern for the foreseeable future , " the policy-setting Federal Open Market Committee said .
-0 2339738 2339771 " It is bad for Symbian , " said Per Lindberg , analyst at Dresdner Kleinwort Wasserstein . " Motorola has displayed clear disloyalty " to Symbian , said Per Lindberg , an analyst at Dresdner Kleinwort Wasserstein in London .
-0 1616174 1616206 Bob Richter , a spokesman for House Speaker Tom Craddick , had no comment about the ruling . Bob Richter , spokesman for Craddick , R-Midland , said the speaker had not seen the ruling and could not comment .
-1 635783 635802 But Ms Ward said the headroom under its financial covenants was " tight " and that there could be another downgrade if Southcorp breached any of its banking covenants . But Ms Ward said the headroom under its financial covenants was " tight " and that there could be a rating downgrade if Southcorp did breach any banking covenants .
-1 3444633 3444733 He added : ``I 've never heard of more reprehensiblebehaviour by a doctor . The Harrisons ’ lawyer Paul LiCalsi said : “ I ’ ve never heard of more reprehensible behaviour by a doctor .
-1 555553 555528 Broomhead was assigned to 2nd Squadron , 3rd Armor Cavalry Regiment , based at Fort Carson . Broomhead , 34 , was assigned to the 2nd Squadron , 3rd Armored Cavalry Regiment .
-1 1112021 1111925 Other staff members , however , defended the document , saying it would still help policy-makers and the agency improve efforts to address the climate issue . Some E.P.A. staff members defended the document , saying that although pared down it would still help policy makers and the agency address the climate issue .
-0 2749410 2749625 President Bush raised a record-breaking $ 49.5 million for his re-election campaign over the last three months , with contributions from 262,000 Americans , the president 's campaign chairman said Tuesday . President Bush has raised $ 83.9 million since beginning his re-election campaign in May , and has $ 70 million of that left to spend , his campaign said Tuesday .
-1 1629064 1629043 An episode is declared when the ozone reaches .20 parts per million parts of air for one hour . A Stage 1 episode is declared when ozone levels reach 0.20 parts per million .
-1 789691 789665 " He may not have been there , " the defence official said on Thursday . " He may not have been there , " said a defence official speaking on condition of anonymity .
-1 844421 844679 The U.N. troops are in Congo to protect U.N. installations and personnel , and they can only fire in self defense and have been unable to stem the violence . The troops - whose mandate is to protect U.N. installations and personnel - can only fire in self-defense and have been unable to stem the violence .
-1 58540 58567 North American markets grabbed early gains Monday morning , as earnings season begins to slow and economic indicators take the spotlight . North American futures pointed to a strong start to the first trading session of the week Monday , as earnings season slows and economic indicators take the spotlight .
-1 781439 781461 Xerox itself paid a $ 10 million fine last year to settle similar SEC charges . Xerox itself previously paid a $ 10-million penalty to settle the SEC accusations .
-1 1909579 1909408 " This deal makes sense for both companies , " said National Chief Executive Brian Halla . " This deal makes sense for both companies , " Halla said in a prepared statement .
-0 787432 787464 The blasts killed two people and injured more than 150 others . The Atlanta Olympic Games attack killed one woman and injured more than 100 other people .
-0 52758 52343 Morrill 's wife , Ellie , sobbed and hugged Bondeson 's sister-in-law during the service . At the service Morrill 's widow , Ellie , sobbed and hugged Bondeson 's sister-in-law as people consoled her .
-1 1675025 1675047 Spansion products are to be available from both AMD and Fujitsu , AMD said . Spansion Flash memory solutions are available worldwide from AMD and Fujitsu .
-1 2131318 2131372 About 1,500 police will be deployed for the visit . Around 1,500 police are to be deployed at Niigata for the ferry 's visit .
-1 325763 325928 Gamarekian told The News she remembers only the woman 's first name - and refused to reveal it . She told the New York Daily News she remembers only the intern 's first name , which she refused to reveal .
-1 2638975 2638855 One of the FBI ’ s key operatives , who had a falling out with the bureau , provided an account of the operation at a friend ’ s closed immigration court proceeding . One of the FBI 's key operatives , who has had a falling-out with the bureau , provided an account of the operation at a friend 's closed immigration court proceeding .
-1 2198694 2198937 A nationally board certified teacher with a master 's degree , Kelley makes a salary of $ 65,000 in his 30th year . A nationally board certified teacher with a master 's degree , Kelley , in his 30th year teaching , makes $ 65,000 .
-1 1825432 1825301 A man arrested for allegedly threatening to shoot and kill a city councilman from Queens was ordered held on $ 100,000 bail during an early morning court appearance Saturday . The Queens man arrested for allegedly threatening to shoot City Councilman Hiram Monserrate was held on $ 100,000 bail Saturday , a spokesman for the Queens district attorney said .
-1 2906104 2906322 They were being held Sunday in the Camden County Jail on $ 100,000 bail . They remained in Camden County Jail on Sunday on $ 100,000 bail .
-1 722278 722383 Ms Stewart , the chief executive , was not expected to attend . Ms Stewart , 61 , its chief executive officer and chairwoman , did not attend .
-0 101747 101777 Christina 's aunt , Shelley Riling , said the defense 's claims were preposterous . Christina 's aunt , Shelley Riling , said she will address the court .
-1 2224884 2224819 The Justice Department Aug. 19 gave pre-clearance for the Oct. 7 date for the election to recall Gov. Gray Davis , saying it would not affect minority voting rights . The Justice Department on Aug. 19 sanctioned the Oct. 7 date for recall election , saying it would not affect voting rights .
-0 977938 978162 Lord Falconer hailed the changes as " a new beginning as far as the courts , Crown Prosecution Service and police are concerned " . " It 's a new beginning as far as the courts , Crown Prosecution Service and police are concerned , making the criminal justice system work better . "
-0 1015010 1014963 GE stock closed at $ 30.65 a share , down about 42 cents , on the New York Stock Exchange . GE 's shares closed at $ 30.65 on Friday on the New York Stock Exchange .
-1 1513190 1513246 At least 27 US troops have been killed in hostile fire since Bush 's statement . At least 26 American troops have been killed in hostile fire since major combat was officially declared over on May 1 .
-1 2385348 2385394 A recent poll showed Edwards with a narrow lead in South Carolina , and he plans a rally there later on Tuesday . A recent poll showed Edwards in a virtual four-way tie at the top in South Carolina , and he plans a rally there later on Tuesday .
-1 2317018 2317252 November 17 's last victim was British defence attache Stephen Saunders , who was shot on an Athens road in June 2000 . November 17 's last victim was British defense attache Stephen Saunders , who was shot and killed at point-blank range on a busy Athens road in June 2000 .
-0 1831696 1831660 The agency charged that one WD Energy worker discussed false reporting with traders at two other energy companies . The agency found further that a WD Energy employee discussed false reporting with traders at two other energy companies , which the CFTC didn 't identify .
-1 1528383 1528083 Zulifquar Ali , a worshipper slightly wounded by shrapnel , said the assailants first targeted the mosque 's security guards . Witness Zulfiqar Ali , who was slightly wounded by shrapnel , said the attackers had focused on the mosque 's guards .
-1 917965 918315 For the second year in a row , rises in hospital costs accounted for much of the inflation , accounting for 51 percent of the overall cost increase . For the second year in a row , rises in hospital costs dominated the increase , accounting for 51 percent of the overall cost spiral .
-0 3218713 3218830 Q : Can I buy coverage for prescription drugs right away ? Congress has added a new benefit - an option to buy insurance coverage for prescription drugs .
-1 221079 221003 The airline also said it has the option to buy 380 more airplanes , orders that would be split evenly between the two manufacturers . The airline has the option to buy 380 more , split evenly between the two manufacturers .
-1 2546175 2546198 Dr Mark McClean , Jonathan 's family doctor , said if the drug had been administered earlier Jonathan would have retained more of his brain functions . Dr Mark McClean , the family 's GP , said had the drug been administered to Jonathan earlier , he would have retained more of his brain function .
-0 799346 799268 The chain operates more than 3,400 stores , and has annual revenue of about $ 15.8 billion . The chain , which has been under new management since late 1999 , has more than 3,400 stores and $ 15.8 billion in annual revenue .
-0 2673104 2673130 All patients developed some or all of the symptoms of E. coli food poisoning : bloody diarrhea , vomiting , abdominal cramping and nausea . Symptoms of the E. coli infection include bloody diarrhea , nausea , vomiting and abdominal cramping .
-1 1354501 1354476 Federal regulators have turned from sour to sweet on a proposed $ 2.8 billion merger of ice cream giants Nestle Holdings Inc. and Dreyer 's Grand Ice Cream Inc . Federal regulators have changed their minds on a proposed $ 2.8 billion merger of ice cream giants Nestle Holdings and Dreyer 's Grand Ice Cream .
-1 3070979 3070949 Environmental campaigners are using this weekend ’ s lunar eclipse to highlight the huge increase in light pollution across the UK . Environmental campaigners used the eclipse to highlight the surge in light pollution across Britain .
-0 1264509 1264471 Available July 7 , the software supports the Solaris , IBM AIX , Red Hat Linux and Windows operating systems . The OpForce product currently works with Solaris , AIX , Red Hat Linux and Windows servers .
-1 103280 103431 Justice Minister Martin Cauchon and Prime Minister Jean Chrétien have both said the Liberal government will introduce legislation soon to decriminalize possession of small amounts of pot for personal use . Justice Minister Martin Cauchon and Prime Minister Jean Chretien both have said the government will introduce legislation to decriminalize possession of small amounts of pot .
-0 110731 110648 But Chauncey Billups demonstrated he 's also capable of big games , scoring 77 points over the final two games against the Magic . Billups scored 77 points in the final two games of the first-round series against the Magic .
-1 2274844 2274714 Kelly killed himself after being exposed as the source for a BBC report which claimed the government had embellished evidence of Iraq 's banned weapons to justify the war . He killed himself after being exposed as the source for a BBC report which claimed the government exaggerated the case for war against Iraq .
-0 1050307 1050144 And it 's going to be a wild ride , " said Allan Hoffenblum , a Republican consultant . Now the rest is just mechanical , " said Allan Hoffenblum , a Republican consultant .
-1 2810634 2810670 While the Ibrahims had one separation operation , Goodrich and Dr. David Staffenberg plan about three for the Aguirres , with several weeks between each . Instead of one long operation to separate the twins , Goodrich and Dr. David Staffenberg plan about three , with several weeks between each .
-1 3073773 3073779 Lay had contended that turning over the documents would violate his Fifth Amendment right against self-incrimination . Lay had refused to turn over the papers , asserting his Fifth Amendment right against self-incrimination .
-0 261202 260995 The WHO experts didn 't say how many cases in Hebei were in rural areas . Hebei has reported 191 cases and eight deaths , though the WHO experts did not say how many were in rural areas .
-1 1824224 1824209 Nearly 300 mutinous troops who seized a Manila shopping and apartment complex demanding the government resign gave up and retreated peacefully after some 19 hours . Mutinous troops who seized a Manila shopping and apartment complex demanding the government resign ended a 19-hour standoff late Sunday and returned to barracks without a shot fired .
-1 548867 548785 In three years , Lend Lease has slipped from a top-five stock , when its share price was around $ 24 , to 37th . In the space of three years , Lend Lease has slipped from a top-five 5 stock when its share price hovered around $ 24 to 37th on the list .
-0 2796658 2796682 About two hours later , his body , wrapped in a blanket , was found dumped a few blocks away . Then his body was dumped a few blocks away , found in a driveway on Argyle Road .
-1 1808166 1808434 Columbia broke up over Texas upon re-entry on Feb. 1 . Columbia broke apart in the skies above Texas on Feb. 1 .
-1 853475 853342 A year or two later , 259 , or 10 per cent , of the youths reported that they had started to smoke , or had taken just a few puffs . Within two years , 259 , or 10 percent , of the youths reported they had started to smoke or had at least taken a few puffs .
-0 977772 977804 The Lord Chancellor was guardian of the Great Seal , used to stamp all official documents from the sovereign . Falconer will hold on , for now , to the Lord Chancellor 's Great Seal , used to sign off instructions from the sovereign .
-1 577854 578500 Cindy Yeast , a 50-year-old Washington-area publicist , says she began taking supplements two years ago in part to avoid mild dementia that affects her elderly parents . She started taking supplements two years ago - partly to stave off mild dementia that affects her elderly parents .
-1 2829194 2829229 The two are not related , but have referred to each other as father and son . He 's not related to Malvo , but the two have referred to each other as father and son .
-1 2074182 2074668 Gibson said last month in a press statement that " neither I nor my film are anti-Semitic . Gibson said in a June statement that he and his film are not anti-Semitic .
-0 2758265 2758282 The world 's largest software company said it recognized the difficulty the multiple patches posed for companies , and set out to make it easier for them to apply the updates . The world 's largest software company said it recognized the difficulty the multiple patches posed for companies trying to apply them .
-1 1958079 1958143 The Dow Jones industrial average .DJI ended up 64.64 points , or 0.71 percent , at 9,191.09 , according to the latest available data . The blue-chip Dow Jones industrial average .DJI added 38 points , or 0.42 percent , to 9,165 .
-1 544217 544325 The vote came just two days after Kurds swept City Council elections , taking the largest single block of votes on the 30-seat council . The vote for mayor followed City Council elections that gave Kurds the largest block of votes on the 30-seat council .
-1 2385288 2385256 Large swells and dangerous surf already were being felt along sections of the coast . Already large swells and dangerous surf have arrived along the mid-Atlantic .
-0 2324708 2325028 Based on a separate survey of households , the unemployment rate fell in August to 6.1 percent from 6.2 percent . Labor Department analysts discounted a slight improvement in the national unemployment rate , which fell in August to 6.1 percent from 6.2 percent .
-1 2139506 2139427 " We will work with the board to ensure a smooth transition . " He said federal regulators would work with the corporation to ensure a " smooth transition . "
-1 2965576 2965701 Gasps could be heard in the courtroom when the photo was displayed . Gasps could be heard as the photo was projected onto the screen .
-1 2931098 2931144 Gilead had earnings of $ 73.1 million , or 33 cents a share , compared with $ 20.8 million , or 10 cents , in the year-ago quarter . Quarterly profit climbed to $ 73.1 million , or 33 cents a share , from $ 20.8 million , or 10 cents , a year earlier , the company said .
-0 644788 644816 " I had one bad stretch of holes that put me out of contention to win , " Woods said . " I had one bad stretch of holes that put me out of contention , " Woods said , referring to his 42 on the front nine Saturday .
-0 2551891 2551563 The poll had a margin of error of plus or minus 2 percentage points . It had a margin of sampling error of plus or minus four percentage points and was conducted Thursday through Saturday .
-1 1089053 1089297 Sen. Patrick Leahy of Vermont , the committee 's senior Democrat , later said the problem is serious but called Hatch 's suggestion too drastic . Sen. Patrick Leahy , the committee 's senior Democrat , later said the problem is serious but called Hatch 's idea too drastic a remedy to be considered .
-1 3435735 3435717 The broad Standard & Poor 's 500 < .SPX > eased 0.37 of a point , or 0.03 percent , at 1,121 . The Standard & Poor 's 500 Index < .SPX > slipped 0.26 point , or 0.02 percent , to 1,121.96 .
-0 1954 2142 Watertown , Saugus and Framingham also are going smoke-free Monday , joining a growing number of cities around the country . Along with Boston , Watertown , Saugus and Framingham also are going smoke-free Monday .
-1 3400796 3400822 That is evident from their failure , three times in a row , to get a big enough turnout to elect a president . Three times in a row , they failed to get a big _ enough turnout to elect a president .
-1 1220668 1220801 We firmly believe we have an absolute right to use the common word ' spike ' as the name of our network . " We firmly believe that we have an absolute right to use the common word ' spike ' to name our network .
-1 1889954 1889847 Sources who knew of the bidding said last week that cable TV company Comcast Corp. was also looking at VUE . Late last week , sources told Reuters cable TV company Comcast Corp. CMCSA.O also was looking at buying VUE assets .
-1 315785 315653 But MTA officials appropriated the money to the 2003 and 2004 budgets without notifying riders or even the MTA board members considering the 50-cent hike , Hevesi found . MTA officials appropriated the surplus money to later years ' budgets without notifying riders or the MTA board members when the 50-cent hike was being considered , he said .
-0 1521034 1520582 White , who had suffered kidney failure from years of high blood pressure , died at Cedars-Sinai Medical Center around 9 : 30 a.m. , said manager Ned Shankman . White , who had kidney failure from years of high blood pressure , had been undergoing dialysis and had been hospitalized since a September stroke .
-1 2083598 2083810 About 10 percent of high school and 16 percent of elementary students must be proficient at math . In math , 16 percent of elementary and middle school students and 9.6 percent of high school students must be proficient .
-1 1910610 1910455 The legal ruling follows three days of intense speculation Hewlett-Packard Co. may be bidding for the company . The legal ruling follows three days of wild volatility in RIM 's stock over speculation that PC giant Hewlett-Packard Co. may be bidding for the company .
-1 3113791 3113782 The European Commission , the EU 's antitrust enforcer , is expected to issue its decision next spring — unless a settlement is reached . The European Commission is expected to issue its decision in the case next spring — unless a settlement is reached .
-1 3214517 3214483 " So Sebastian did his best to convincingly confess to a crime that he didn 't commit in order to survive , " she told jurors . " Sebastian did his best to confess convincingly to a crime he didn 't do in order to survive , " Ms. Richardson declared .
-0 2083612 2083810 Twenty percent of Latino students and 23 percent of black students performed at proficient or higher . In math , 16 percent of elementary and middle school students and 9.6 percent of high school students must be proficient .
-1 661390 661218 He is charged in three bombings in Atlanta including a blast at the 1996 Olympics and one in Alabama . He is charged in three bombings in Atlanta - including a blast at the 1996 Olympics - along with the bombing in Alabama .
-1 1269572 1269682 The men were remanded in custody and are due to appear again before court on July 8 . They were remanded in custody and will appear in court again on July 8 .
-1 1095780 1095652 " No matter who becomes the sponsor for stock-car racing 's top series , NASCAR will need an all-star event , " Wheeler said in a statement . No matter who becomes the sponsor for stock-car racings top series , NASCAR will need an all-star event , Wheeler said Tuesday .
-1 116294 116332 The Phillies were upset that Counsell had stolen second in the sixth inning with Arizona leading 7-1 . The Phillies were apparently upset when Counsell stole during the sixth with the Diamondbacks up 7-1 .
-1 941617 941673 He said his hatred for such people grew from these discussions and had helped convince him violence was the answer . His hatred for these people had germinated from these discussions and helped cement his belief that violence was the panacea .
-1 2640607 2640576 " There is no need for one deadline for all to create the ASEAN Economic Community , " Thaksin said . Thus , he said , there did not have to one deadline to create the economic community .
-1 3310210 3310286 The announcement was made during the recording of a Christmas concert attended by top Vatican cardinals , bishops , and many elite from Italian society , witnesses said . The broadside came during the recording on Saturday night of a Christmas concert attended by top Vatican cardinals , bishops and many elite of Italian society , witnesses said .
-1 3376093 3376101 The additional contribution brings total U.S. food aid to North Korea this year to 100,000 tonnes . The donation of 60,000 tons brings the total of U.S. contributions for the year to 100,000 .
-1 1549586 1549609 Leon Williams ' body was found inside his third-floor apartment at 196 Bay St. , in Tompkinsville . The dead man , Leon Williams , was found in his third-floor apartment .
-1 460211 460445 The player 's eyes were bloodshot and a blood-alcohol test produced a reading of 0.18 - well above Tennessee 's level of presumed intoxication of 0.10 , the report said . He failed a field sobriety test and a blood-alcohol test produced a reading of 0.18 – well above Tennessee 's level of presumed intoxication of 0.10 , the report said .
-1 1196962 1197061 But Virgin wants to operate Concorde on routes to New York , Barbados and Dubai . Branson said that his preference would be to operate a fully commercial service on routes to New York , Barbados and Dubai .
-0 862804 862715 He tried to fight off officers and was taken to a hospital after a police dog bit him but was later released . Cruz tried to fight off officers and was hospitalized after a police dog bit him , Sgt. Steve Dixon said .
-1 1726935 1726879 The announcement , which economists said was not a surprise , may be bittersweet for the millions of Americans without jobs . Economists said the announcement was not a surprise , and politicians said it offered little comfort to the millions of Americans without jobs .
-0 331980 332110 Asked if the delegates could leave on Friday , police intelligence chief in Aceh , Surya Dharma , told reporters they could not because they did not have proper permission . Asked if the delegates could leave on Friday , police intelligence chief Surya Dharma told reporters : " Of course they may not go .
-1 173879 173832 Dealers said the dollar also drew some downside support as Japanese investors are expected to keep snapping up foreign bonds amid the yen 's rise against the dollar . Dealers said the dollar also drew some downside support as Japanese investors are expected to keep snapping up foreign bonds amid ever-falling domestic interest rates .
-0 2834988 2835026 Iran has until the end of the month to satisfy the agency it has no plans for nuclear weapons . The Iranians have until the end of the month to answer all the agency 's questions about their past nuclear activities .
-1 2587300 2587243 Her father , Florin Cioaba , the king of Transylvania 's Gypsies , had her brought back and she was married against her will . Her father , Roma King Florin Cioaba , had her brought back and she was promptly married against her will .
-0 554905 554627 Claire had advanced to the third round of the 76th annual Scripps Howard National Spelling Bee . One by one they strolled to the microphone , all 251 youngsters in the 76th Scripps Howard National Spelling Bee .
-1 1912524 1912648 Citigroup Inc . C.N , the world 's largest financial services company , on Wednesday promoted Marjorie Magner to chairman and chief executive of its global consumer group . Citigroup ( C ) on Wednesday named Marjorie Magner chairman and chief executive of its colossal global consumer business .
-1 3255597 3255668 " They 've been in the stores for over six weeks , " says Carney . The quarterlies usually stay in stores for between six to eight weeks , " Carney added .
-1 629316 629289 Let me just say this : the evidence that we have of weapons of mass destruction was evidence drawn up and accepted by the joint intelligence community . " The evidence that we had of weapons of mass destruction was drawn up and accepted by the Joint Intelligence Committee , " he said .
-1 54181 53570 Ridge said no actual explosives or other harmful substances will be used . Ridge said no real explosives or harmful devices will be used in the exercise .
-1 723557 724115 Thus far , Stewart 's company appears ready to stand behind her . For now , the company 's management appears to be standing behind Stewart .
-0 2607718 2607708 But late Thursday night , the campaign issued a statement saying there would be no news conference and no big announcement . But late yesterday , the campaign and the state Democratic Party said there would be no news conference .
-1 753858 753890 There 's also a flaw that results because IE does not implement an appropriate block on a file download dialog box . The second vulnerability is a result of IE not implementing a block on a file download dialog box .
-1 587009 586969 Another $ 100-million in savings will come from management layoffs and pay cuts . The airline expects to save another $ 100-million a year through management layoffs and pay cuts .
-1 308567 308525 He called on Prime Minister John Howard to establish a royal commission on child sex abuse . The Senate motion also called on Prime Minister John Howard to hold a royal commission into child sex abuse .
-0 665419 665612 " We think that the United States of America should support the free speech of all groups , " Mr. White said , objecting to Mr. Olson 's recommendation . We think that the United States of America should support the free speech of all groups , he said .
-1 2763517 2763576 Terri Schiavo , 39 , underwent the procedure at the Tampa Bay area hospice where she has been living for several years , said her father , Bob Schindler . The tube was removed Wednesday from Terri Schiavo , 39 , at the Tampa Bay-area hospice where she has lived for several years .
-0 3107118 3107136 After 18 months , Nissen found that Lipitor stopped plaque buildup in the patients ' arteries . After 18 months , the atorvastatin patients had no change in the plaque in their arteries .
-1 780604 780466 Toll , Australia 's second-largest transport company , last week offered NZ75 a share for Tranz Rail . Toll last week offered to buy the company for NZ75c a share , or $ NZ158 million .
-0 1989213 1989116 " This child was literally neglected to death , " Armstrong County District Attorney Scott Andreassi said . Armstrong County District Attorney Scott Andreassi said the many family photos in the home did not include Kristen .
-1 1462409 1462504 Wal-Mart , the nation 's largest private employer , has expanded its antidiscrimination policy to protect gay and lesbian employees , company officials said Tuesday . Wal-Mart Stores Inc . , the nation 's largest private employer , will now include gays and lesbians in its anti-discrimination policy , company officials said Wednesday .
-1 260952 260924 Metro , bus and local rail services in France 's four largest towns -- Paris , Lyon , Lille and Marseille -- were severely disrupted , Europe 1 radio reported . Subway , bus and suburban rail services in France 's four largest cities -- Paris , Lyon , Lille and Marseille -- were severely disrupted , transport authorities said .
-1 1224743 1225510 In the undergraduate case , Rehnquist said the use of race was not " narrowly tailored " to achieve the university 's asserted interest in diversity . Rehnquist wrote that the system was not narrowly tailored to achieve the interest in educational diversity .
-0 3329379 3329416 SP2 is basically about security enhancements to Windows , such as the improved Internet Connection Firewall ( ICF ) . The firewall in the current Windows XP was known as the Internet Connection Firewall ( ICF ) .
-1 2362761 2362698 A landslide in central Chungchong province derailed a Seoul-bound train and 28 passengers were injured , television said . In central Chungchong province , a landslide caused a Seoul-bound Saemaeul Express train to derail , injuring 28 people , local television said .
-0 1465073 1464854 They will help draft a plan to attack obesity that Kraft will implement over three to four years . The team will help draft a plan by the end of the year to attack obesity .
-1 195728 196099 But that amount would probably be impossible to pass in the Senate , where Republican moderates have refused to go above $ 350 billion . Such an amount would probably be unable to summon a majority of the Senate , where Republican moderates have refused to go above $ 350 billion .
-1 2587767 2587673 In the clash with police , Lt. Mothana Ali said about 1,000 demonstrators had gone to the station demanding jobs . In Baghdad , police Lieut . Mothana Ali said about 1,000 demonstrators arrived at the station demanding jobs .
-0 1490044 1489975 Corixa shares rose 54 cents to $ 7.74 yesterday on the Nasdaq Stock Market . Shares of Corixa rose 54 cents , or about 8 percent , to close at $ 7.74 .
-1 958161 957782 Committee approval , expected today , would set the stage for debate on the Senate floor beginning Monday . That would clear the way for debate in the full Senate beginning on Monday .
-1 1033204 1033365 O 'Brien was charged with leaving the scene of a fatal accident , a felony . Bishop Thomas O 'Brien , 67 , was booked on a charge of leaving the scene of a fatal accident .
-0 2996241 2996734 Tom Hamilton said his daughter was conscious and alert and in stable condition after the attack Friday morning . Bethany , who remained in stable condition after the attack Friday morning , talked of the attack Saturday .
-0 2015389 2015410 The Calgary woman , who is in her twenties , donated blood on Aug. 7 . The woman -- who has no symptoms of illness -- donated blood Aug. 7 .
-1 221515 221509 Quattrone lawyer John W. Keker said his client is innocent . In a statement Monday , his lawyer John Keker said ``Frank Quattrone is innocent .
-0 2283737 2283794 In the weeks leading up to the execution , several Florida officials received anonymous threatening letters . Several Florida officials connected to the case have received threatening letters , accompanied by rifle bullets .
-1 2826681 2826474 The disagreement over online music sales was disclosed in documents filed last week with the judge and made available by the court yesterday . The fight over online music sales was disclosed in documents made available Monday by the court .
-1 2249237 2249305 Parson was charged with intentionally causing and attempting to cause damage to protected computers . Parson is charged with one count of intentionally causing damage to a protected computer .
-1 389239 389299 " The court and the public need to know much more of the details of the defendant 's seemingly massive fraud , " the judge said . " The court and the public need to know more of the defendants ' seemingly massive fraud , " he said .
-1 2652187 2652218 The U.S. Supreme Court will hear arguments on Wednesday on whether companies can be sued under the Americans with Disabilities Act for refusing to rehire rehabilitated drug users . The high court will hear arguments today on whether companies can be sued under the ADA for refusing to rehire rehabilitated drug users .
-1 2945693 2945847 The IRS said taxpayers can avoid undelivered checks by having refunds deposited directly into their checking or savings accounts . The IRS said taxpayers can avoid problems with lost or stolen refunds by having refunds deposited directly into personal checking or savings accounts .
-1 2065523 2065836 " More than 70,000 men and women from bases in Southern California were deployed in Iraq . In all , more than 70,000 troops based in Southern California were deployed to Iraq .
-1 2222998 2223097 BP shares slipped 0.8 percent to 433.50 pence ( $ 6.85 ) each in afternoon trading on the London Stock Exchange . BP shares slipped 48 cents to $ 41.72 Friday in trading on the New York Stock Exchange .
-1 2561999 2561941 Because of the accounting charge , the company now says it lost $ 1.04 billion , or 32 cents a share , in the quarter ended June 30 . Including the charge , the Santa Clara , Calif.-based company said Monday it lost $ 1.04 billion , or 32 cents per share , in the period ending June 30 .
-0 2324704 2325023 Friday 's report raised new worries that a weak job market could shackle the budding economic recovery despite a slight improvement in the overall unemployment rate . U.S. companies slashed payrolls for a seventh straight month in August , raising new worries that a weak jobs market could shackle the budding economic recovery .
-1 2336453 2336545 Federal Emergency Management Administration designated $ 20 million to establish the registry . The registry was launched with $ 20 million from the Federal Emergency Management Agency .
-1 720572 720486 BREAST cancer cases in the UK have hit an all-time high with more than 40,000 women diagnosed with the disease each year , Cancer Re-search UK revealed yesterday . Cases of breast cancer in Britain have reached a record high , with the number of women diagnosed with the disease passing the 40,000 mark for the first time .
-1 1605818 1605806 " It was never our intention to sell the product , " said Health Minister Anne McClellan , a skeptic of medical marijuana use . " It was never the intention of us to sell product , " federal Health Minister Anne McLellan said yesterday in Edmonton .
-0 2440680 2440474 GM , the world 's largest automaker , has 115,000 active UAW workers and another 340,000 retirees and spouses . They cover more than 300,000 UAW workers and 500,000 retirees and spouses .
-0 726399 726078 Rosenthal is hereby sentenced to custody of the Federal Bureau of prisons for one day with credit for time served , " Breyer said to tumultuous cheers in the courtroom . " Rosenthal is hereby sentenced to custody of the Federal Bureau of Prisons for one day with credit for time served . "
-1 533903 533818 " We are committed to helping the Iraqi people get on the path to a free society , " Rumsfeld said in a speech to the Council on Foreign Relations . " We are committed to helping the Iraqi people get on the path to a free society , " he said .
-1 1166473 1166857 Mr. Young said he was disappointed that the government didn 't see the severe acute respiratory syndrome crisis as worthy of federal disaster-relief money . Young said he was disappointed the government didn 't see the SARS crisis as worthy of federal disaster relief money .
-1 144089 143697 The 12-nation currency has risen by 33 percent against the dollar over the past 15 months . The euro is up 9 percent against the dollar in the past six weeks .
-1 3439854 3439874 In February 2000 , the officers — Kenneth Boss , Sean Carroll , Edward McMellon and Richard Murphy — were acquitted of all charges in the killing . The officers -- Kenneth Boss , Sean Carroll , Edward McMellon and Richard Murphy -- were acquitted in 2000 of state murder charges .
-1 3464314 3464302 I was surprised it turned out me talking and the president just listening . " I was surprised it turned out me talking and the president just listening . . . It was mostly a monologue . "
-1 2008984 2009175 The state 's House delegation currently consists of 17 Democrats and 15 Republicans . Democrats hold a 17-15 edge in the state 's U.S. House delegation .
-0 816867 816831 Freddie also said Leland C. Brendsel will retire as chairman and chief executive and resign from the board . He replaces Leland Brendsel , 61 , who retired as chairman and chief executive .
-1 192285 192327 We 'll be listening carefully to the [ IAEA ] director general 's report at the next board meeting . " We 'll be listening carefully to the ( IAEA ) director-general 's report at the next board meeting . "
-1 2688145 2688162 In that position , Elias will report to Joe Tucci , president and CEO of EMC . As executive vice president of new ventures , Elias will report to Joe Tucci , EMC 's president and chief executive .
-1 3294207 3294290 But with the PM due to leave tomorrow afternoon for personal reasons there was a risk he might not be present when the final decision was made . But with the Prime Minister due to leave tomorrow , a day early , he may not be present when the final decision is made .
-0 205100 205145 A pro-independence radical , Miodrag Zivkovic , of the Liberal Alliance , came in second with 31 percent of the vote . Miodrag Zivkovic , of the Liberal Alliance of Montenegro , won 31 percent of the vote while the independent Dragan Hajdukovic got four percent .
-0 3242051 3241897 Mr. Kerkorian tried unsuccessfully to take over Chrysler in 1995 , but did win representation on its board . Kerkorian and Tracinda had also tried to take over Chrysler in 1995 .
-0 1076861 1077018 Glover spoke at a news conference that included about 20 relatives of the victims . About 20 family members of the victims were invited to the news conference .
-1 2095803 2095786 Drax faced a financial crisis late last year after it lost its most lucrative sales contract , held with insolvent utility TXU Europe . Drax ’ s troubles began late last year when it lost its most lucrative sales contract , with the insolvent utility TXU Europe .
-1 2112330 2112376 But I would rather be talking about high standards than low standards . " " I would rather be talking about positive numbers rather than negative .
-1 3389318 3389271 It was not immediately known how many people were on flight UTA 141 , which could carry 141 passengers and crew . It was still not known exactly how many people were on the plane , which could carry 141 passengers and crew .
-1 698948 698933 The market remains pinned in a narrow range after a powerful rally drove the broad Standard & Poor 's 500 index .SPX up more than 20 percent since mid-March . The market remains pinned in a narrow range after a powerful rally pushed the broad S & P 500 index up more than 20 percent since mid-March .
-1 539585 539355 Witnesses said they believed the man planned to crash the Launceston-bound Qantas flight 1737 , which was carrying 47 passengers and six crew . Witnesses believe he wanted to crash Flight 1737 , which had 47 passengers and six crew .
-1 684848 684557 As Samudra sat down to hear the indictment , he looked over to his nine lawyers and shouted ``God is Great ' ' three times . As he sat down to hear the indictment , Samudra looked over to his nine lawyers and shouted " Takbir ! " , or " Proclaim ! " , a religious rallying cry .
-1 347017 347002 In hardest-hit Taipei , traffic has disappeared from once bustling streets , ubiquitous department stores stand mostly empty and restaurants are eerily quiet . In hardest-hit Taipei , traffic has disappeared from once-bustling streets and department stores and restaurants are virtually empty .
-1 1592037 1592076 In a statement , Lee said he " no longer believes that Viacom deliberately intended to trade on my name when naming Spike TV . " Spike Lee no longer believes that Viacom deliberately intended to trade on his name by calling its own venture " Spike TV , " according to a statement read in court Tuesday .
-0 3013483 3013540 Singapore Prime Minister Goh Chok Tong says China plays an important role in the integration of Asia , including managing the stresses and strains both within and between countries . HAINAN PROVINCE , China : Singapore Prime Minister Goh Chok Tong said China plays an important role in the integration of Asia .
-1 2020252 2020081 The worm attacks Windows computers via a hole in the operating system , an issue Microsoft on July 16 had warned about . The worm attacks Windows computers via a hole in the operating system , which Microsoft warned of 16 July .
-0 2614947 2614904 The premium edition adds OfficeFront Page 2003 , Acceleration Server 2000 , and SQL Server 2000 . The premium edition adds ISA Server , SQL Server and a specialized edition of BizTalk 2004 .
-0 1744257 1744378 In the year-ago quarter , the steelmaker recorded a profit of $ 16.2 million , or 15 cents per share , on sales of $ 1.14 billion . In the second quarter last year , AK Steel reported a profit of $ 16.2 million , or 15 cents a share .
-0 1119721 1119714 Sony claimed that the reader 's capacitance sensing technology cannot be fooled by paper copies and does not require cleaning . Its capacitance sensing technology electronically reads a fingerprint ; Sony says it can 't be fooled by paper copies and doesn 't require cleaning .
-1 1186754 1187056 Amazon.com shipped out more than a million copies of the new book , making Saturday the largest distribution day of a single item in e-commerce history . Amazon.com shipped more than a million copies by Saturday afternoon , making Saturday the largest distribution day of a single item in e-commerce history .
-1 2842562 2842582 The show 's closure affected third-quarter earnings per share by a penny . The company said this impacted earnings by a penny a share .
-0 431076 431242 After the two-hour meeting on May 14 , publisher Arthur O. Sulzberger Jr . , executive editor Howell Raines and managing editor Gerald Boyd pledged quick remedies to staff grievances . The committee will make recommendations to Publisher Arthur Sulzberger , Executive Editor Howell Raines and Managing Editor Gerald Boyd .
-1 1393764 1393984 It 's been a busy couple of days for security gurus assigned to keep their companies safe and sound . It 's been a busy couple of days for enterprise security gurus tasked with the job of keeping their companies safe and sound .
-0 2916199 2916164 Lu reclined in a soft chair wearing a woolly coat near the blackened capsule . " It 's great to be back home , " said Lu , dressed in a woolly coat near the blackened capsule .
-1 2530671 2530542 Gov. Bob Riley proposed the budget cuts after Alabama voters rejected his $ 1.2 billion tax plan Sept . 9 . After Alabama voters rejected his $ 1.2 billion tax plan Sept . 9 , Riley forecast significant cuts in state programs .
-1 219064 218969 " It is probably not the easiest time to come in and take over the shuttle program , but then again , I look forward to the challenge , " he said . " It 's probably not the easiest time to come in and take over the shuttle program , but I look forward to the challenge , " Parsons told reporters at NASA headquarters .
-0 2377289 2377259 Estonia 's place in the European mainstream and safeguard its independence regained in 1991 . Estonia was forcibly incorporated in the Soviet Union in 1940 and regained its independence only in 1991 .
-0 2110220 2110199 Franklin County Judge-Executive Teresa Barton said a firefighter was struck by lightning and was taken to the Frankfort Regional Medical Center . A county firefighter , was struck by lightning and was in stable condition at Frankfort Regional Medical Center .
-0 1864253 1863810 Police suspected that Shaichat , 20 , had been abducted either by Palestinians or by Israeli Arabs . Nobody claimed responsibility for Schaichat 's death , but police suspect that the 20-year-old soldier was abducted either by Palestinians or Israeli Arabs .
-0 3150803 3150839 During this year 's August to October quarter , Lowe 's opened 38 new stores , including two relocations . During the third quarter , Lowe 's opened 38 new stores and now has 932 stores in 45 states .
-0 969381 969512 The technology-laced Nasdaq Composite Index < .IXIC > declined 25.78 points , or 1.56 percent , to 1,627.84 . The broader Standard & Poor 's 500 Index .SPX gave up 11.91 points , or 1.19 percent , at 986.60 .
-1 271891 271839 Sony said the PSP would also feature a 4.5-inch LCD screen , Memory Stick expansion slots . It also features a 4.5 in back-lit LCD screen and memory expansion facilities .
-0 2829648 2829613 Clinton did not mention that two Democratic senators , Charles Robb of Virginia and Wendell Ford of Kentucky , voted to shelve the McCain bill . Two Democrats , Sen. Charles Robb of Virginia and Wendell Ford of Kentucky , voted with the 40 Republicans .
-1 886904 887158 Some of the company 's software developers will join Microsoft , but details haven 't been finalized , said Mike Nash , corporate vice president of Microsoft 's security business unit . Some of the companys software developers will join Microsoft , but details havent been finalized , said Mike Nash , corporate vice president of Microsofts security business unit .
-0 2632692 2632767 Wal-Mart has said it plans to open at least 40 Supercenters in the state in the coming years ; analysts expect four or more to be in San Diego County . At least 40 of the outlets will be in California , and analysts expect four or more to be in San Diego County .
-1 2240399 2240149 Cintas is battling efforts to unionize 17,000 of its workers and to let unions organize the workers by signing cards , rather than by a lengthy election process . Cintas is battling efforts to unionize 17,000 of its workers and labor 's demands to let its workers organize by signing cards , rather than by a lengthy election process .
-1 805457 805985 The opposition would resort to rolling mass action " at strategic times of our choice and without warning to the dictatorship , " he said . " From now onwards we will embark on rolling mass action at strategic times of our choice and without any warning to the dictatorship , " he said .
-1 2896308 2896334 Federal Agriculture Minister Warren Truss said the Government still did not know the real reason the sheep were rejected at the Saudi port of Jeddah on August 21 . He said the Government still did not know the real reason the original Saudi buyer pulled out on August 21 .
-1 2110775 2110924 Tom Kraynak , manager of operations and resources for the Canton , Ohio-based East Central Area Reliability Council , said that scenario is one among many that investigators are considering . Tom Kraynak , manager of operations and resources for the Canton , Ohio-based East Central Area Reliability Council , said investigators are considering the scenario .
-1 1762569 1762526 Hester said Sanmina was the best fit among several purchase offers the company received from electronics manufacturers and computer makers . Hester said Sanmina 's offer was the best among several Newisys received from electronics manufacturers and computer makers .
-0 2706154 2706185 The other inmate fell but Selenski shimmed down the makeshift rope to a second-story roof and used the mattress to scale a razor-wire fence , Fischi said . After the other inmate fell , Selenski used the mattress to scale a 10-foot , razor-wire fence , Fischi said .
-1 1057995 1057778 The hearing , expected to last a week , will determine whether Akbar faces a court-martial . The purpose of the hearing is to determine whether Akbar should be court-martialled .
-1 1386884 1386857 He said he has begun a court action to seize Beacon Hill 's assets and has frozen more than $ 13 million Beacon Hill had when it closed . He said he has initiated a forfeiture action in court and frozen more than $ 13 million Beacon Hill had when it closed .
-1 3093023 3092996 Speaking for the first time yesterday , Brigitte 's maternal aunt said his family was unaware he had was in prison or that he had remarried . Brigitte 's maternal aunt said his family was unaware he had been sent to prison , or that he had remarried in Sydney .
-1 1661381 1661317 " Close co-operation between our law enforcement agencies , close co-operation between our intelligence services lie at the heart of the ongoing fight against terrorism . " Close cooperation between regional law enforcement agencies and intelligence services was at the heart of the fight against terrorism , he said .
-0 2926039 2925982 The mother of a Briton held by Colombian guerrillasspoke of her relief yesterday after hearing that he might be freed in the next few weeks . The parents of a Briton being held hostage by Colombian rebels spoke yesterday of their optimism that he would be freed in time for his birthday next month .
-0 637168 637447 We strongly disagree with Novell 's position and view it as a desperate measure to curry favor with the Linux community . McBride characterized Novell 's move as " a desperate measure to curry favor with the Linux community . "
-1 696677 696932 After more than two years ' detention under the State Security Bureau , the four were found guilty of subversion in Beijing 's No. 1 Intermediate Court last Wednesday . After more than two years in detention by the State Security Bureau , the four were found guilty last Wednesday of subversion .
-1 3122429 3122305 Mr Russell , 46 , a coal miner from Brisbane , said : " They are obviously hurting , so we are basically going over there to help them . " " They are obviously hurting so we are basically going over there to help them , " Russell , 46 , said .
-1 1348909 1348954 The New York Democrat and former first lady has said she will not run for the White House in 2004 , but has not ruled out a race in later years . The former first lady has said she will not run for the White House in 2004 but has not ruled out a race later on .
-0 162203 162101 It does not affect the current Windows Media Player 9.0 Series . Windows Media Player has had security problems before .
-0 71501 71627 The seizure took place at 4 a.m. on March 18 , just hours before the first American air assault . The time was about 4 a.m. on March 18 , just hours before the first pinpoint missiles rained down on the capital .
-1 2907762 2907649 Donations stemming from the Sept . 11 attacks helped push up contributions to human service organizations and large branches of the United Way by 15 percent and 28.6 percent , respectively . Donations stemming from the Sept . 11 attacks helped push up contributions to human service organizations by 15 percent and to large branches of the United Way by 28.6 percent .
-1 2167771 2167744 In May , Mr. Hatfill said he was struck by a vehicle being driven by an FBI employee who was tailing him in Georgetown . Last May , Hatfill was struck by a vehicle being driven by an FBI employee who was tailing him in Washington 's Georgetown neighborhood .
-1 3320577 3320553 " I will support a constitutional amendment which would honor marriage between a man and a woman , codify that , " he said . " If necessary , I will support a constitutional amendment which would honour marriage between a man and a woman , codify that . "
-1 849291 849442 IBM of the US and Infineon Technologies of Germany will today announce a technological development that could threaten multi-billion dollar memory chip markets . IBMof the US andInfineon Technologies of Germany willon Tuesdayannounce a technological development that could threaten multi-billion dollar memory chip markets .
-0 763948 763991 Costa 's semifinal opponent is Spaniard Juan Carlos Ferrero , whom he beat in last year 's final . Costa will play Juan Carlos Ferrero next in a rematch of last year 's final .
-1 1908763 1908744 A former employee of a local power company pleaded guilty Wednesday to setting off a bomb that knocked out a power substation during the Winter Olympics last year . A former Utah Power meter reader pleaded guilty Wednesday to bombing a power substation during the 2002 Winter Olympics .
-0 1876120 1876059 Thyroid hormones are known to help in weight loss by stimulating metabolism - and cutting cholesterol - but come with the unwanted side effect of speeding up the heartbeat . Thyroid hormones are known to help in weight loss by stimulating metabolism , and they can help cut cholesterol too .
-1 518089 518133 Judge Craig Doran said it wasn 't his role to determine if Hovan was " an evil man " but maintained that " he has committed an evil act . " Judge Craig Doran said he couldn 't determine if Hovan was " an evil man " but said he " has committed an evil act . "
-0 224932 224868 The Hartford shares rose $ 2.88 , or 6.6 percent , to close Monday at $ 46.50 on the New York Stock Exchange . Shares of Hartford rose $ 2.88 to $ 46.50 in New York Stock Exchange composite trading .
-1 1771131 1771091 It also offers a built-in NAND flash boot loader so that high-density NAND flash memory can be used without having to install an additional support chip . The S3C2440 has a built-in NAND flash boot loader , for example , so that high-density NAND flash memory can be installed without an additional support chip .
-0 2728425 2728251 It decided instead to issue them before the stock market opened Monday after the downgrade of its debt late Friday by Moody 's , the credit rating agency . It decided instead to issue them before the stock market opened Monday to counteract the downgrade of its debt late Friday by Moody 's to one step above junk status .
-0 953733 953537 Altria shares fell 2.5 percent or $ 1.11 to $ 42.57 and were the Dow 's biggest percentage loser . Its shares fell $ 9.61 to $ 50.26 , ranking as the NYSE 's most-active issue and its biggest percentage loser .
-1 349215 349241 It will be followed in November by a third movie , " The Matrix Revolutions . " The film is the second of a trilogy , which will wrap up in November with " The Matrix Revolutions . "
-1 2919853 2919804 Massachusetts regulators and the Securities and Exchange Commission on Tuesday pressed securities fraud charges against Putnam Investments and two of its former portfolio managers for alleged improper mutual fund trading . State and federal securities regulators filed civil charges against Putnam Investments and two portfolio managers in the ever-expanding mutual fund trading scandal .
-1 954526 954607 He is blocking them until the Air Force assigns four additional C-130 cargo planes to Gowen Field , an Idaho Air National Guard base in Boise . He is holding them up until the Air Force agrees to assign four additional C-130 cargo planes to the Idaho Air National Guard .
-1 69773 69792 Cisco pared spending to compensate for sluggish sales . In response to sluggish sales , Cisco pared spending .
-0 2823575 2823513 The study , published Monday in the journal Molecular Brain Research , is likely to also apply to humans , its authors said . The study , conducted on the brains of developing mice , was being published today in the journal Molecular Brain Research .
-1 2455942 2455978 My decision today is not based on any one event . " Governor Rowland said his decision was " not based on any one event . "
-1 131979 131957 Nelson , 27 , is being retried on civil-rights charges stemming from the disturbance which led to Rosenbaum 's death . Nelson , 27 , is being retried on civil rights charges stemming from the disturbance that led to Rosenbaum 's death .
-0 2010705 2010779 " The government elements who have been causing trouble are still in place . The government elements who have been causing trouble are still in place , they are attacking us . "
-1 54142 53641 Next Monday at about 2 p.m. ( CST ) , hospital officials in and near Chicago will notice a sudden increase in people complaining of flu-like symptoms . Around the same time , hospital officials in and near Chicago will notice a sudden increase in people complaining of flu-like symptoms .
-1 1015249 1015204 Wal-Mart Stores Inc . , Kohl 's Corp. , Family Dollar Stores Inc. and Big Lots Inc. were among the merchants posting May sales that fell below Wall Street 's modest expectations . Wal- Mart , Kohl 's Corp. , Family Dollar Stores Inc . , and Big Lots Inc. posted May sales that fell below Wall Street 's modest expectations .
-0 753928 753890 The patch also fixes a vulnerability that results because IE does not implement an appropriate block on a file download dialog box . The second vulnerability is a result of IE not implementing a block on a file download dialog box .
-1 3022833 3023029 Peterson , a former fertilizer salesman , is charged with murder in the deaths of his 27-year-old wife and the baby boy she was carrying . Peterson , 31 , is now charged with murder in the deaths of his 27-year-old wife and their unborn son .
-0 751520 751373 SPOT products run a Microsoft operating system and the company 's DirectBand radio technology developed with SCA Data Systems . The DirectBand network was developed with the assistance of SCA Data Systems .
-0 218848 218851 He replaces Ron Dittemore , who announced his resignation in April . Dittemore announced his plans to resign on April 23 .
-1 3181118 3181443 Detectives told Deasean 's father , Stelly Chisolm , a college student , and mother , Kimberly Hill , of the arrest shortly after Perry was apprehended . Shortly after his arrest , detectives told Deasean 's father , Stelly Chisolm , a college student , and mother , Kimberly Hill , a medical assistant , about the development .
-1 515581 515752 They were among about 40 people attending the traditional Jewish ceremony colored by some non-traditional touches . He said about 40 people attended the traditional Jewish ceremony colored by some nontraditional touches .
-1 347022 347003 Taiwan had been relatively free of the viral infection until a fiasco at a Taipei hospital in late April caused the number of infections to skyrocket . Taiwan had been relatively free of the viral infection until a severe outbreak at a Taipei hospital in late April .
-1 3311600 3311633 Mr. Rowland attended a party in South Windsor for the families of Connecticut National Guard soldiers called to active duty . Rowland was making an appearance at a holiday party for families of Connecticut National Guard soldiers assigned to duty in Iraq and Afghanistan .
-0 3439114 3439084 Ross Garber , Rowland 's lawyer , said Tuesday he would attend the meeting and would ask to speak on the issue . Ross Garber , Rowland 's legal counsel , said the governor would have no comment on the condo deal .
-0 487951 488007 The euro was at 1.5281 versus the Swiss franc EURCHF = , up 0.2 percent on the session , after hitting its highest since mid-2001 around 1.5292 earlier in the session . The euro was steady versus the Swiss franc after hitting its highest since mid-2001 of 1.5261 earlier in the session .
-0 314997 315030 On the stand Wednesday , she said she was referring only to the kissing . On the stand Wednesday , she testified that she was referring to the kissing before the alleged rape .
-0 4733 4557 Garner said the group would probably be expanded to include , for example , a Christian and perhaps another Sunni leader . The group has already met several times and Gen. Garner said it probably will be expanded to include a Christian and perhaps another Sunni Muslim leader .
-1 2820371 2820525 Blair 's Foreign Secretary Jack Straw was to take his place on Monday to give a statement to parliament on the European Union . Blair 's office said his Foreign Secretary Jack Straw would take his place on Monday to give a statement to parliament on the EU meeting the prime minister attended last week .
-1 801552 801516 " There were more people surrounding the clubhouse than the Unabomber 's house up in the hills , " Baker said . " There are more people surrounding the clubhouse than surrounded the Unabomber 's home in the hills .
-1 1704987 1705268 Charles O. Prince , 53 , was named as Mr. Weill 's successor . Mr. Weill 's longtime confidant , Charles O. Prince , 53 , was named as his successor .
-1 396041 396188 Officials are also meeting with the International Organization for Epizootics ( OIE ) , which establishes animal-health standards for the world . Canadian officials were also expected to meet yesterday with the International Organization for Epizootics ( OIE ) , which establishes animal-health standards for the world .
-0 1014983 1014963 GE stock closed Friday at $ 30.65 a share , down about 42 cents , on the New York Stock Exchange . GE 's shares closed at $ 30.65 on Friday on the New York Stock Exchange .
-1 2320654 2320666 The Midwestern research center will focus on the development of diagnostic , therapeutic and vaccine products for anthrax , botulism , tularemia , hemorrhagic fever viruses and plague . The Midwestern center will focus on diagnosis , treatment and vaccines for anthrax , botulism , tularemia , hemorrhagic fever viruses and plague .
-1 1057876 1057778 The hearing is to determine whether there is enough evidence to order Akbar to a general court-martial proceeding . The purpose of the hearing is to determine whether Akbar should be court-martialled .
-0 2116843 2116883 In the United States , heart attacks kill about 460,000 year , in Canada about 80,000 . In the United States , heart attacks kill about 460,000 yearly , according to the National Institutes of Health .
-1 1461629 1461781 Ninety-five percent of international cargo to the United States is carried by ship . Ships carry 95 percent of international cargo to the United States .
-0 374015 374162 " It 's a major victory for Maine , and it 's a major victory for other states . The Maine program could be a model for other states .
-1 2493369 2493428 News that oil producers were lowering their output starting in November exacerbated a sell-off that was already under way on Wall Street . News that the Organization of Petroleum Exporting Countries was lowering output starting in November exacerbated a stock sell-off already under way yesterday .
-1 490355 490378 They note that after several weeks of rallies on upbeat earnings , investors are looking for stronger evidence of a recovery before sending stocks higher . After several weeks of market rallies on upbeat earnings , many investors are looking for more concrete signs of an economic recovery .
-1 2691044 2691264 Most economists had expected a more dire report , with many anticipating the fifth month of job losses in six months . Most economists had been expecting a far more dire report , with many expecting to see the fifth month of job losses in six months in September .
-1 1831453 1831491 But software license revenues , a measure financial analysts watch closely , decreased 21 percent to $ 107.6 million . License sales , a key measure of demand , fell 21 percent to $ 107.6 million .
-1 2380695 2380822 King , brand-name writer , master of the horror story and e-book pioneer , is receiving this year 's medal for Distinguished Contributions to American Letters . Stephen King , master of the horror story and e-book pioneer , is receiving this year 's medal for Distinguished Contributions to American Letters from the National Book Foundation .
-1 2577517 2577531 The Denver-based natural gas producer and marketer said the inaccurate reporting was discovered after it received a subpoena from the U.S. Commodity Futures Trading Commission . The natural gas producer and marketer said the inaccurate reporting was discovered in response to a subpoena from the U.S. Commodity Futures Trading Commission , or CFTC .
-1 3267026 3266930 The steel tariffs , which the U.S. president imposed in March 2002 , will officially end at midnight , instead of March 2005 as initially planned . The U.S. steel tariffs , which Bush imposed in March 2002 , were to officially end at midnight Thursday ( 0500 GMT ) , instead of March 2005 as initially planned .
-1 360875 360943 Business Week 's online edition reported on Friday that WorldCom and the SEC could announce a settlement as early as Monday . BusinessWeek Online has learned that the settlement could come as early as Monday , May 19 .
-1 162632 162653 Only one of the five buildings in the Baghdad compound of the United Nations Development Program escaped being burned , the UN said on its Web site . Only one of the five buildings in the compound in Baghdad run by the UN Development Program , escaped being burned , the UN said on its Web site .
-1 1128884 1128865 Shares of Salix have rocketed 64 percent since Axcan made its first offer on April 10 . Since the initial takeover offer , Salix shares have risen about 35 percent .
-1 3264732 3264648 The jury verdict , reached Wednesday after less than four hours of deliberation , followed a 2 week trial , during which Waagner represented himself . The quick conviction followed a 2 1 / 2 week trial , during which the Venango County man represented himself .
-1 1721433 1721267 It 's happened five times in the last 11 years : A disaster puts this Southwestern town in the headlines during the summer tourist season . It 's happened five times in the last decade : A disaster puts this tourist town in the headlines during summer , its busiest season .
-0 146112 146127 The broader Standard & Poor 's 500 Index .SPX edged down 9 points , or 0.98 percent , to 921 . The technology-laced Nasdaq Composite Index < .IXIC > shed 15 points , or 0.98 percent , to 1,492 .
-1 389117 389052 The company emphasized that McDonald 's USA does not import any raw beef or hamburger patties from Canada for McDonald 's use in the United States . McDonald 's said in a statement that it does not import any raw beef or hamburger patties from Canada for use in the United States .
-1 872784 872834 Gregory Parseghian , a former investment banker , was appointed chief executive . Greg Parseghian was appointed the new chief executive .
-0 2977500 2977547 Their contract will expire at 12 : 01 a.m. Wednesday instead of 12 : 01 a.m. Sunday , said Rian Wathen , organizing director for United Food and Commercial Workers Local 700 . " It has outraged the membership , " said Rian Wathen , organizing director of United Food and Commercial Workers Local 700 .
-1 3107137 3107119 But plaque volume increased by 2.7 percent in pravastatin patients . The volume of plaque in Pravachol patients ' arteries rose by 3 % .
-1 1619244 1619274 Today in the US , the book - kept under wraps by its publishers , G. P. Putnam 's Sons , since its inception - will appear in bookstores . Tomorrow the book , kept under wraps by G. P. Putnam 's Sons since its inception , will appear in bookstores .
-0 3061836 3062031 The S & P / TSX composite rose 87.74 points on the week , while the TSX Venture Exchange composite gained 44.49 points . On the week , the Dow Jones industrial average rose 11.56 points , while the Nasdaq Stock Market gained 39.42 points .
-1 485999 486011 Ex-KGB agent Putin added that the Beatles were considered ' propaganda of an alien ideology ' . In Soviet times the Beatles ' music " was considered propaganda of an alien ideology .
diff --git a/docs/examples/sentence_embedding/elmo_sentence_representation.md b/docs/examples/sentence_embedding/elmo_sentence_representation.md
deleted file mode 100644
index 84c8309bdc..0000000000
--- a/docs/examples/sentence_embedding/elmo_sentence_representation.md
+++ /dev/null
@@ -1,165 +0,0 @@
-# Extracting Sentence Features with Pre-trained ELMo
-
-While word embeddings have been shown to capture syntactic and semantic information of words and have become a standard component in many state-of-the-art NLP architectures, their context-free nature limits their ability to represent context-dependent information.
-Peters et al. proposed a deep contextualized word representation method, called Embeddings from Language Models, or ELMo for short [1].
-This model is pre-trained with a self-supervised task called bidirectional language modeling; the authors show that the resulting representations are powerful and improve the state-of-the-art performance on many tasks such as question answering, natural language inference, semantic role labeling, coreference resolution, named-entity recognition, and sentiment analysis.
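-
-Concretely, the bidirectional language model objective maximizes the log-likelihood of each token given both its left and right context; in the notation of [1]:
-
-$$\sum_{k=1}^{N}\Big(\log p(t_k \mid t_1,\ldots,t_{k-1};\,\Theta_x,\overrightarrow{\Theta}_{LSTM},\Theta_s)+\log p(t_k \mid t_{k+1},\ldots,t_N;\,\Theta_x,\overleftarrow{\Theta}_{LSTM},\Theta_s)\Big)$$
-
-where $\Theta_x$ denotes the token representation parameters, $\overrightarrow{\Theta}_{LSTM}$ and $\overleftarrow{\Theta}_{LSTM}$ the forward and backward LSTM parameters, and $\Theta_s$ the softmax parameters shared between the two directions.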
-
-In this notebook, we will show how to leverage the model API in GluonNLP to automatically download the pre-trained ELMo model and generate sentence representations with it.
-
-We will focus on:
-
-1) how to process and transform data so that it can be used with the pre-trained ELMo model, and
-2) how to load the pre-trained ELMo model, and use it to extract representations from the preprocessed data.
-
-## Preparation
-
-We start with the usual preparation like importing libraries and setting up the environment.
-
-### Load MXNet and GluonNLP
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-import numpy as np
-import io
-
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-```
-
-## Preprocess the data
-
-The goal of pre-processing the data is to numericalize the text using pre-processing steps consistent with how the ELMo model was trained.
-
-The exact same vocabulary needs to be used so that the indices in the model's embedding match the pre-trained model.
-In this section, we will proceed with the following steps:
-
-1) Loading a custom dataset
-2) Tokenizing the dataset in the same way as during ELMo training
-3) Numericalizing the tokens at both the word and character level using the provided `vocab`
-
-### Loading the dataset
-
-The first step is to create a dataset from existing data.
-Here, we use a paragraph from [1] as our dataset, using the built-in [TextLineDataset](../../api/data.rst#gluonnlp.data.TextLineDataset) class.
-It's a dataset of 7 samples, each of which is a sentence.
-
-```{.python .input}
-elmo_intro = """
-Extensive experiments demonstrate that ELMo representations work extremely well in practice.
-We first show that they can be easily added to existing models for six diverse and challenging language understanding problems, including textual entailment, question answering and sentiment analysis.
-The addition of ELMo representations alone significantly improves the state of the art in every case, including up to 20% relative error reductions.
-For tasks where direct comparisons are possible, ELMo outperforms CoVe (McCann et al., 2017), which computes contextualized representations using a neural machine translation encoder.
-Finally, an analysis of both ELMo and CoVe reveals that deep representations outperform those derived from just the top layer of an LSTM.
-Our trained models and code are publicly available, and we expect that ELMo will provide similar gains for many other NLP problems.
-"""
-
-elmo_intro_file = 'elmo_intro.txt'
-with io.open(elmo_intro_file, 'w', encoding='utf8') as f:
- f.write(elmo_intro)
-
-dataset = nlp.data.TextLineDataset(elmo_intro_file, 'utf8')
-print(len(dataset))
-print(dataset[2]) # print an example sentence from the input data
-```
-
-### Transforming the dataset
-
-Once we have a dataset consisting of sentences in raw text form, the next step is to transform
-the dataset into the format that the ELMo model expects and on which it was trained.
-
-In our case, transforming the dataset consists of tokenization and numericalization.
-
-#### Tokenization
-
-The ELMo pre-trained models are trained on the Google 1-Billion Words dataset, which was tokenized with the Moses Tokenizer.
-In GluonNLP, using [SacreMosesTokenizer](../../api/data.rst#gluonnlp.data.SacreMosesTokenizer) accomplishes this.
-Once tokenized, we can add special markers for the beginning and end of each sentence: BOS (beginning of sentence) and EOS (end of sentence).
-
-```{.python .input}
-tokenizer = nlp.data.SacreMosesTokenizer()
-dataset = dataset.transform(tokenizer)
-dataset = dataset.transform(lambda x: ['<bos>'] + x + ['<eos>'])
-print(dataset[2]) # print the same tokenized sentence as above
-```
-
-
-#### Using Vocab from pre-trained ELMo
-
-Numericalizing the dataset is as straightforward as applying the ELMo-specific character-level
-vocabulary as a transformation. For details on ELMo's vocabulary, see
-[ELMoCharVocab](../../api/vocab.rst#gluonnlp.vocab.ELMoCharVocab).
-We also calculate the length of each sentence in preparation for batching.
-
-```{.python .input}
-vocab = nlp.vocab.ELMoCharVocab()
-dataset = dataset.transform(lambda x: (vocab[x], len(x)), lazy=False)
-```
-
-#### Creating the `DataLoader`
-
-Now that the dataset is ready, loading it with the `DataLoader` is straightforward.
-Here, we pad the first field to the maximum length in the batch, and stack the actual length of each sentence to form
-batches.
-The lengths will later be used to construct a mask.
-For more advanced usage examples of the DataLoader object, check out the
-[Sentiment Analysis tutorial](../sentiment_analysis/sentiment_analysis.ipynb).
-
-```{.python .input}
-batch_size = 2
-dataset_batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0),
- nlp.data.batchify.Stack())
-data_loader = gluon.data.DataLoader(dataset,
- batch_size=batch_size,
- batchify_fn=dataset_batchify_fn)
-```
-
-## Loading the pre-trained ELMo model
-
-Using the model API in GluonNLP, you can automatically download the pre-trained models simply by
-calling `get_model`. The available options are:
-
-1. elmo_2x1024_128_2048cnn_1xhighway
-2. elmo_2x2048_256_2048cnn_1xhighway
-3. elmo_2x4096_512_2048cnn_2xhighway
-
-Note that the second field in `get_model`'s return value is ELMo's vocabulary.
-Since we already created an instance of it above, here we simply ignore this field.
-
-```{.python .input}
-elmo_bilm, _ = nlp.model.get_model('elmo_2x1024_128_2048cnn_1xhighway',
- dataset_name='gbw',
- pretrained=True,
- ctx=mx.cpu())
-print(elmo_bilm)
-```
-
-## Putting everything together
-
-Finally, we feed the prepared data batch into the [ELMoBiLM](../../api/model.rst#gluonnlp.model.ELMoBiLM) model.
-
-```{.python .input}
-def get_features(data, valid_lengths):
-    length = data.shape[1]
-    # Initialize the biLM's hidden states for this batch
-    hidden_state = elmo_bilm.begin_state(mx.nd.zeros, batch_size=batch_size)
-    # Build a (batch_size, length) mask: 1 for real tokens, 0 for padding positions
-    mask = mx.nd.arange(length).expand_dims(0).broadcast_axes(axis=(0,), size=(batch_size,))
-    mask = mask < valid_lengths.expand_dims(1).astype('float32')
-    # Run the ELMo biLM; the output is a list with one representation per layer
-    output, hidden_state = elmo_bilm(data, hidden_state, mask)
- return output
-
-batch = next(iter(data_loader))
-features = get_features(*batch)
-print([x.shape for x in features])
-```
-
-## Conclusion and summary
-
-In this tutorial, we showed how to generate sentence representations from the ELMo model.
-In GluonNLP, this can be done in just a few simple steps: reusing ELMo's data transformations to preprocess the data, automatically downloading the pre-trained model, and feeding the transformed data into the model.
-To see how to plug pre-trained models into your own model architecture and use fine-tuning to improve downstream tasks, check out our [Sentiment Analysis tutorial](../sentiment_analysis/sentiment_analysis.ipynb).
-
-## References
-
-[1] Peters, Matthew E., et al. "Deep contextualized word representations." NAACL (2018).
diff --git a/docs/examples/sentence_embedding/sentences.json b/docs/examples/sentence_embedding/sentences.json
deleted file mode 100644
index 1369580dfa..0000000000
--- a/docs/examples/sentence_embedding/sentences.json
+++ /dev/null
@@ -1,38 +0,0 @@
-[
- [
- "The U.S. Centers for Disease Control and Prevention initially advised school systems to close if outbreaks occurred , then reversed itself , saying the apparent mildness of the virus meant most schools and day care centers should stay open , even if they had confirmed cases of swine flu .",
- "When Ms. Winfrey invited Suzanne Somers to share her controversial views about bio-identical hormone treatment on her syndicated show in 2009 , it won Ms. Winfrey a rare dollop of unflattering press , including a Newsweek cover story titled \" Crazy Talk : Oprah , Wacky Cures & You . \"",
- "Elk calling -- a skill that hunters perfected long ago to lure game with the promise of a little romance -- is now its own sport .",
- "Don 't !",
- "Fish , ranked 98th in the world , fired 22 aces en route to a 6-3 , 6-7 ( 5 \/ 7 ) , 7-6 ( 7 \/ 4 ) win over seventh-seeded Argentinian David Nalbandian .",
- "Why does everything have to become such a big issue ?",
- "AMMAN ( Reuters ) - King Abdullah of Jordan will meet U.S. President Barack Obama in Washington on April 21 to lobby on behalf of Arab states for a stronger U.S. role in Middle East peacemaking , palace officials said on Sunday .",
- "To help keep traffic flowing the Congestion Charge will remain in operation through-out the strike and TfL will be suspending road works on major London roads wherever possible .",
- "If no candidate wins an absolute majority , there will be a runoff between the top two contenders , most likely in mid-October .",
- "Authorities previously served search warrants at Murray 's Las Vegas home and his businesses in Las Vegas and Houston ."
- ],
- [
- "Brent North Sea crude for November delivery rose 84 cents to 68.88 dollars a barrel .",
- "That seems to have been their model up til now .",
- "Gordon will join Luol Deng on the GB team ; their respective NBA teams , the Detroit Pistons and the Chicago Bulls , play tonight .",
- "Nikam maintains the attacks were masterminded by the Muslim militant group Lashkar-e-Taiba .",
- "Last year , Williams was unseeded , ranked 81st and coming off one of her worst losses on tour -- in a Tier 4 event at Hobart -- yet she beat six seeded players en route to the title at Melbourne Park .",
- "It said that two officers involved in the case had been disciplined .",
- "\" There is more intelligence now being gathered , \" the official said , adding that such efforts would continue for some time .",
- "The majority will be of the standard 6X6 configuration for carrying personnel .",
- "\" Consequently , necessary actions may not be taken to reduce the risks to children of sexual exploitation and drug or alcohol misuse , \" the report said . \u2022 Almost two-thirds of inspected schools were good or outstanding , but the number of underperforming secondaries remained \" stubborn and persistent . \"",
- "What a World Cup ."
- ],
- [
- "But , there have also been many cases of individuals and small groups of people protesting , as in the case of Rongye Adak , a nomad who called for the return of the Dalai Lama and for the freedom of Tibet during the Lithang Horse Racing Festival , in eastern Tibet .",
- "James Duncan , head of transportation at Bournemouth Borough Council , said : \" Our legal team is reviewing the entitlement of taxis to drop and pick up passengers at bus stops , only for as long as is absolutely necessary to fulfil that function and for no other reason .",
- "To Mo concerning the food log you kept -- Dr. Buchholz recommends the same thing .",
- "The CBO estimates that only 23 percent of that would be spent in 2009 and 2010 .",
- "Even so , Democrats slammed Bush as out of touch .",
- "An information campaign will be launched later to raise awareness of employment rights and how to enforce them .",
- "At the gallery the concept is less vague , as Ms. Piper cites specific instances of racial violence , political assassinations and the devastation of Hurricane Katrina .",
- "There have been some exceptions -- such as Medicare in 1965 .",
- "The government guidance will be reviewed early next year after a period of public comment .",
- "It wasn 't the most seaworthy of prizes ."
- ]
-]
diff --git a/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png b/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png
deleted file mode 100644
index 36fd1e7eb8..0000000000
Binary files a/docs/examples/sentiment_analysis/Bi-LSTM-Rep.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/attention-nlp.png b/docs/examples/sentiment_analysis/attention-nlp.png
deleted file mode 100644
index debaab2a68..0000000000
Binary files a/docs/examples/sentiment_analysis/attention-nlp.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/index.rst b/docs/examples/sentiment_analysis/index.rst
deleted file mode 100644
index 0e97ace35d..0000000000
--- a/docs/examples/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Sentiment Analysis
-==================
-
-.. container:: cards
-
- .. card::
- :title: Fine-tuning LSTM-based Language Model
- :link: sentiment_analysis.html
-
- See how to fine-tune a pre-trained language model to perform sentiment analysis on movie reviews.
-
- .. card::
- :title: Training Structured Self-attentive Sentence Embedding
- :link: self_attentive_sentence_embedding.html
-
- See how to use GluonNLP to build more advanced model structure for extracting sentence
- embeddings to predict Yelp review rating.
-
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sentiment_analysis.ipynb
- self_attentive_sentence_embedding.ipynb
-
diff --git a/docs/examples/sentiment_analysis/samodel-v3.png b/docs/examples/sentiment_analysis/samodel-v3.png
deleted file mode 100644
index abe56d72dc..0000000000
Binary files a/docs/examples/sentiment_analysis/samodel-v3.png and /dev/null differ
diff --git a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md b/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md
deleted file mode 100644
index ac60e0cea7..0000000000
--- a/docs/examples/sentiment_analysis/self_attentive_sentence_embedding.md
+++ /dev/null
@@ -1,559 +0,0 @@
-# Training Structured Self-attentive Sentence Embedding
-
-Beyond the initial novelty of word embeddings as numerical representations of words, natural language processing (NLP) has continued to improve in many ways. Alongside the widespread use of embedding techniques, many other methods have been developed to express the semantics of a sentence from its words:
-
-1. A vector representation of multiple words in a sentence can be concatenated or weighted to obtain a vector to represent the entirety of a sentence.
-
-2. Applying convolution (CNN) and maximum pooling (MaxPooling) to the matrix of all the word vectors of the sentence, and using the result to represent the sentence as a whole.
-
-3. Unrolling the sentence according to the time step of the word, inputting the vector representation of each word into a recurrent neural network (RNN), and using the output of the last time step of the RNN as the representation of the sentence.
-
-The above methods address sentence meaning, but only to a certain extent. With concatenation in method one, if the sentence is long and the word vectors have even a moderately large dimension, the resulting sentence vector becomes particularly large, and the interactions between the words of the sentence cannot be taken into account. Weighted averaging is not accurate either and does not adequately express the impact of each word on sentence semantics.
-
-In the second method, many useful word meanings may be lost using CNNs and MaxPooling.
-
-In the third method, the representation selected is only the output of the last step. If a sentence is too long, the output of the last step does not accurately express the entirety of the sentence's semantics.
-
-Building on these methods, Zhouhan Lin, Minwei Feng et al. published the paper [A Structured Self-attentive Sentence Embedding](https://arxiv.org/pdf/1703.03130.pdf)[1] in 2017, proposing a novel self-attention-based structure for sentence embedding, with applications to users' review classification, textual entailment, and other NLP tasks, and obtaining better results than the previous methods.
-
-In this tutorial, we will use [GluonNLP](https://gluon-nlp.mxnet.io/index.html) to reproduce the model structure in "A Structured Self-attentive Sentence Embedding" and apply it to [Yelp Data's review star rating data set](https://www.yelp.com/dataset/challenge) for classification.
-
-## Importing necessary packages
-
-The first step, as in every one of these tutorials, is to import the necessary packages.
-
-```{.python .input}
-import os
-import json
-import zipfile
-import time
-import itertools
-
-import numpy as np
-import mxnet as mx
-import multiprocessing as mp
-import gluonnlp as nlp
-
-from mxnet import autograd, gluon, init, nd
-from mxnet.gluon import nn, rnn
-
-# Use sklearn's metric functions to evaluate the results of the experiment
-from sklearn.metrics import accuracy_score, f1_score
-
-# Fix the random number seeds for reproducibility
-np.random.seed(2018)
-mx.random.seed(2018)
-
-def try_gpu():
- """If GPU is available, return mx.gpu(0); else return mx.cpu()."""
- try:
- ctx = mx.gpu()
- _ = nd.array([0], ctx=ctx)
- except:
- ctx = mx.cpu()
- return ctx
-```
-
-## Data pipeline
-
-The next step is to load and format the data according to the requirements of our model. The dataset used in this tutorial is the Yelp users' review dataset.
-
-### Loading the dataset
-
-The [Yelp users' review dataset](https://www.kaggle.com/yelp-dataset/yelp-dataset) is formatted as JSON. The original paper selected 500,000 documents as the training set, 2,000 as the validation set, and 2,000 as the test set. For easier reproducibility of the experiment, we subsampled 198,000 documents from this dataset as the training set and 2,000 documents as the validation set.
-
-Each sample in the data consists of a user's comment, in English, rated one through five stars, with each rating representing the sentiment the user expressed. Here we download, unzip, and reformat the dataset for ease of use further on.
-
-
-```{.python .input}
-# Download the data from the server
-data_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/yelp_review_subset-167bb781.zip'
-zip_path = mx.gluon.utils.download(data_url)
-
-# Unzip the zip file
-zip_file = zipfile.ZipFile(zip_path)
-json_path = zip_file.extract(zip_file.namelist()[0])
-
-## Load the JSON data
-with open(json_path, 'r', encoding='utf-8') as fr:
- data = json.load(fr)
-
-# Create a list of (review, label) pairs
-dataset = [[text, int(label)] for text, label in zip(data['texts'], data['labels'])]
-
-# Randomly split off one percent of the training set as a validation set
-train_dataset, valid_dataset = nlp.data.train_valid_split(dataset, 0.01)
-len(train_dataset), len(valid_dataset)
-```
-
-### Preliminary processing of the data
-
-The purpose of the following code is to process the raw data so that the pre-processed data can be used for model training and prediction. We will use the `SpacyTokenizer` to split each document into tokens, `ClipSequence` to crop the comments to the specified length, and then build a vocabulary based on the word frequencies in the training data. Next, we attach the pre-trained [GloVe](https://nlp.stanford.edu/pubs/glove.pdf) [2] word vectors to the vocabulary and convert each token into the corresponding word index in the vocabulary.
-Finally, we get the standardized training and validation data sets. Here we also define a few helper functions for later, and take advantage of `mp.Pool()` to spread the pre-processing over multiple cores.
-
-
-```{.python .input}
-# The tokenizer takes as input a string and outputs a list of tokens.
-tokenizer = nlp.data.SpacyTokenizer('en')
-
-# `length_clip` takes as input a list and outputs a list with maximum length 100.
-length_clip = nlp.data.ClipSequence(100)
-
-def preprocess(x):
-
- # Convert the number of stars 1, 2, 3, 4, 5 to zero-based index, 0, 1, 2, 3, 4
- data, label = x[0], x[1]-1
-
- # Clip the length of review words
- data = length_clip(tokenizer(data))
- return data, label
-
-def get_length(x):
- return float(len(x[0]))
-
-def preprocess_dataset(dataset):
- start = time.time()
-
- with mp.Pool() as pool:
- # Each sample is processed in an asynchronous manner.
- dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
-
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Preprocess the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-valid_dataset, valid_data_lengths = preprocess_dataset(valid_dataset)
-```
-
-This section creates the `vocab` object, attaches the pre-trained GloVe embeddings to it, and converts the dataset's tokens to their vocabulary indices.
-
-```{.python .input}
-# Create the vocab
-train_seqs = [sample[0] for sample in train_dataset]
-counter = nlp.data.count_tokens(list(itertools.chain.from_iterable(train_seqs)))
-
-vocab = nlp.Vocab(counter, max_size=10000)
-
-# Load the pre-trained embedding, in this case the 300-dimensional GloVe embedding
-embedding_weights = nlp.embedding.GloVe(source='glove.6B.300d')
-vocab.set_embedding(embedding_weights)
-print(vocab)
-
-def token_to_idx(x):
- return vocab[x[0]], x[1]
-
-# A token index or a list of token indices is returned according to the vocabulary.
-with mp.Pool() as pool:
- train_dataset = pool.map(token_to_idx, train_dataset)
- valid_dataset = pool.map(token_to_idx, valid_dataset)
-
-```
-
-## Bucketing, mini-batches, and the `DataLoader`
-Since each sentence may have a different length, we need to use `Pad` to fill the sentences in a mini-batch to equal lengths so that the data can be packed into a single tensor and processed quickly on the GPU. At the same time, we need to use `Stack` to stack the labels of a batch of data. For convenience, we use `Tuple` to combine `Pad` and `Stack`, as illustrated in the short sketch below.
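-
-As a minimal, illustrative sketch (not part of the actual pipeline), here is what such a batchify function produces on a toy batch of made-up token indices, assuming the same `nlp.data.batchify` API used below:
-
-```{.python .input}
-import gluonnlp as nlp
-
-toy_batchify_fn = nlp.data.batchify.Tuple(
-    nlp.data.batchify.Pad(axis=0, pad_val=0),  # pad token ids up to the longest sequence
-    nlp.data.batchify.Stack())                 # stack the integer labels
-
-toy_samples = [([1, 2, 3], 0), ([4, 5], 1)]    # two (token_ids, label) samples
-toy_tokens, toy_labels = toy_batchify_fn(toy_samples)
-print(toy_tokens)  # shape (2, 3); the shorter sequence is padded with 0
-print(toy_labels)  # the two labels stacked into a single array
-```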
-
-In order to keep the amount of padding in each mini-batch as small as possible, we should group sentences of similar length into the same batch. In light of this, we construct a sampler using `FixedBucketSampler`, which defines how the samples in a dataset will be iterated over in a more economical way.
-
-Finally, we use `DataLoader` to build data loaders for the training and validation datasets. The training dataset uses the `FixedBucketSampler`, while the validation dataset does not need a sampler.
-
-Here we define the helper functions to do all of the above as well as define the hyperparameters for this section:
-
-```{.python .input}
-batch_size = 64
-bucket_num = 10
-bucket_ratio = 0.5
-
-
-def get_dataloader():
-
-    # Construct the batchify function: pad the data and stack the labels
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=0),
- nlp.data.batchify.Stack())
-
- # In this example, we use a FixedBucketSampler,
- # which assigns each data sample to a fixed bucket based on its length.
- batch_sampler = nlp.data.sampler.FixedBucketSampler(
- train_data_lengths,
- batch_size=batch_size,
- num_buckets=bucket_num,
- ratio=bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
-
- # Training set DataLoader
- train_dataloader = gluon.data.DataLoader(
- dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- # Validation set DataLoader
- valid_dataloader = gluon.data.DataLoader(
- dataset=valid_dataset,
- batch_size=batch_size,
- shuffle=False,
- batchify_fn=batchify_fn)
- return train_dataloader, valid_dataloader
-
-train_dataloader, valid_dataloader = get_dataloader()
-```
-
-## Constructing the model and outlining the model's structure
-
-In the original paper, the representation of the sentence is broken into the following steps:
-
-Firstly, the sentence is split into a list of its words (tokens).
-Then the words are unrolled in order, and the word vector of each word is fed as the input of each step of the [bidirectional LSTM neural network layer](https://www.bioinf.jku.at/publications/older/2604.pdf) [3].
-Taking the output of each step of the bidirectional LSTM layer, a matrix H is obtained. Suppose the hidden dimension of the bidirectional LSTM is `U` and the number of words in the sentence is `N`; then the shape of H is `N x 2U`. For example, the sentence "This movie is amazing!" would be represented as:
-![](Bi-LSTM-Rep.png)
-
-Attention works much like human vision: when we look at a scene, we give different importance (or weights) to the things in our field of view. A brief quote from skymind.ai summarizes what attention means in our daily lives as well as in neural networks in a few clear words:
-
-> The word describes the mind’s ability to allocate consideration unevenly across a field of sensation, thought and proprioception, to focus and bring certain inputs to the fore, while ignoring or diminishing the importance of others. So for neural networks, we’re basically talking about credit assignment. [4]
-
-For example, when we are communicating with people, our eyes pay more attention to the speaker's face than to the type of trousers they are wearing or their toenail polish. Likewise, when representing a sentence with this model, we can pay different amounts of attention to the output H of the bidirectional LSTM layer.
-![](attention-nlp.png)
-$$
-A = Softmax(W_{s2}tanh(W_{s1}H^T))
-$$
-
-Here, `Ws1` is a weight matrix of shape `da`-by-`2U`, where `da` is a hyperparameter (the `att_unit` in the code below).
-`Ws2` is a weight matrix of shape `r`-by-`da`, where `r` is the number of attention hops you want to use (the `att_hops` below).
-
-When the attention matrix `A` and the output `H` of the LSTM are obtained, the final representation is obtained by $$M = AH$$.
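-
-For intuition, here is a minimal NumPy sketch of the shapes involved; the sizes are made up for illustration, and the actual model below implements the same equations with `Dense` layers:
-
-```{.python .input}
-import numpy as np
-
-N, U, d_a, r = 6, 4, 5, 2          # sentence length, LSTM hidden size, attention units, attention hops
-H = np.random.randn(N, 2 * U)      # bi-LSTM outputs, one row per word
-W_s1 = np.random.randn(d_a, 2 * U)
-W_s2 = np.random.randn(r, d_a)
-
-scores = W_s2 @ np.tanh(W_s1 @ H.T)                             # shape (r, N)
-A = np.exp(scores) / np.exp(scores).sum(axis=1, keepdims=True)  # row-wise softmax
-M = A @ H                                                       # shape (r, 2U): one summary vector per hop
-print(A.shape, M.shape)                                         # (2, 6) (2, 8)
-```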
-
-We can first customize a layer of attention, specify the number of hidden nodes (`att_unit`) and the number of attention channels (`att_hops`).
-
-
-```{.python .input}
-# A custom attention layer
-class SelfAttention(nn.HybridBlock):
- def __init__(self, att_unit, att_hops, **kwargs):
- super(SelfAttention, self).__init__(**kwargs)
- with self.name_scope():
- self.ut_dense = nn.Dense(att_unit, activation='tanh', flatten=False)
- self.et_dense = nn.Dense(att_hops, activation=None, flatten=False)
-
- def hybrid_forward(self, F, x):
- # x shape: [batch_size, seq_len, embedding_width]
- # ut shape: [batch_size, seq_len, att_unit]
- ut = self.ut_dense(x)
- # et shape: [batch_size, seq_len, att_hops]
- et = self.et_dense(ut)
-
- # att shape: [batch_size, att_hops, seq_len]
- att = F.softmax(F.transpose(et, axes=(0, 2, 1)), axis=-1)
- # output shape [batch_size, att_hops, embedding_width]
- output = F.batch_dot(att, x)
-
- return output, att
-```
-
-When the number of samples per label is very unbalanced, applying different weights to different labels may improve the performance of the model significantly.
-
-```{.python .input}
-
-class WeightedSoftmaxCE(nn.Block):
- def __init__(self, sparse_label=True, from_logits=False, **kwargs):
- super(WeightedSoftmaxCE, self).__init__(**kwargs)
- with self.name_scope():
- self.sparse_label = sparse_label
- self.from_logits = from_logits
-
- def forward(self, pred, label, class_weight, depth=None):
- if self.sparse_label:
- label = nd.reshape(label, shape=(-1, ))
- label = nd.one_hot(label, depth)
- if not self.from_logits:
- pred = nd.log_softmax(pred, -1)
-
- weight_label = nd.broadcast_mul(label, class_weight)
- loss = -nd.sum(pred * weight_label, axis=-1)
-
- # return nd.mean(loss, axis=0, exclude=True)
- return loss
-
-```
-
-We now define the basic model characteristics in a self-attentive bi-LSTM model, and configure the layers and dropout, as well as how the model feeds forward.
-
-```{.python .input}
-class SelfAttentiveBiLSTM(nn.HybridBlock):
- def __init__(self, vocab_len, embsize, nhidden, nlayers, natt_unit, natt_hops, nfc, nclass,
- drop_prob, pool_way, prune_p=None, prune_q=None, **kwargs):
- super(SelfAttentiveBiLSTM, self).__init__(**kwargs)
- with self.name_scope():
- self.embedding_layer = nn.Embedding(vocab_len, embsize)
- self.bilstm = rnn.LSTM(nhidden, num_layers=nlayers, dropout=drop_prob, bidirectional=True)
- self.att_encoder = SelfAttention(natt_unit, natt_hops)
- self.dense = nn.Dense(nfc, activation='tanh')
- self.output_layer = nn.Dense(nclass)
-
- self.dense_p, self.dense_q = None, None
- if all([prune_p, prune_q]):
- self.dense_p = nn.Dense(prune_p, activation='tanh', flatten=False)
- self.dense_q = nn.Dense(prune_q, activation='tanh', flatten=False)
-
- self.drop_prob = drop_prob
- self.pool_way = pool_way
-
- def hybrid_forward(self, F, inp):
- # input_embed: [batch, len, emsize]
- inp_embed = self.embedding_layer(inp)
- h_output = self.bilstm(F.transpose(inp_embed, axes=(1, 0, 2)))
- # att_output: [batch, att_hops, emsize]
- att_output, att = self.att_encoder(F.transpose(h_output, axes=(1, 0, 2)))
-
- dense_input = None
- if self.pool_way == 'flatten':
- dense_input = F.Dropout(F.flatten(att_output), self.drop_prob)
- elif self.pool_way == 'mean':
- dense_input = F.Dropout(F.mean(att_output, axis=1), self.drop_prob)
- elif self.pool_way == 'prune' and all([self.dense_p, self.dense_q]):
- # p_section: [batch, att_hops, prune_p]
- p_section = self.dense_p(att_output)
- # q_section: [batch, emsize, prune_q]
- q_section = self.dense_q(F.transpose(att_output, axes=(0, 2, 1)))
- dense_input = F.Dropout(F.concat(F.flatten(p_section), F.flatten(q_section), dim=-1), self.drop_prob)
-
- dense_out = self.dense(dense_input)
- output = self.output_layer(F.Dropout(dense_out, self.drop_prob))
-
- return output, att
-```
-
-## Configuring the parameters and assembling the model
-
-The resulting `M` is a matrix, which can be turned into a single feature vector for classification by `flatten`-ing, `mean`-ing, or `prune`-ing it. Pruning is an effective way of trimming parameters that was proposed in the original paper, and it has been implemented for our example.
-
-
-```{.python .input}
-vocab_len = len(vocab)
-emsize = 300 # word embedding size
-nhidden = 300 # lstm hidden_dim
-nlayers = 2 # lstm layers
-natt_unit = 300 # the hidden_units of attention layer
-natt_hops = 2 # the channels of attention
-nfc = 512
-nclass = 5
-
-drop_prob = 0.5
-pool_way = 'flatten' # The way to handle M
-prune_p = None
-prune_q = None
-
-ctx = try_gpu()
-
-model = SelfAttentiveBiLSTM(vocab_len, emsize, nhidden, nlayers,
- natt_unit, natt_hops, nfc, nclass,
- drop_prob, pool_way, prune_p, prune_q)
-
-model.initialize(init=init.Xavier(), ctx=ctx)
-model.hybridize()
-
-# Attach a pre-trained glove word vector to the embedding layer
-model.embedding_layer.weight.set_data(vocab.embedding.idx_to_vec)
-# Freeze the embedding layer so its weights are not updated during training
-model.embedding_layer.collect_params().setattr('grad_req', 'null')
-```
-
-Using r attention hops can improve the representation of sentences by capturing different aspects of their semantics, but if the rows of the attention matrix `A` (of shape r-by-n) are very close to one another, there is effectively no difference between the hops, and in $$M = AH$$ the resulting `M` will contain a lot of redundant information.
-So in order to solve this problem, we should encourage the rows of `A` to differ noticeably from one another, that is, to satisfy the diversity of attention. A penalty term can be used to achieve this goal.
-
-$$ P = ||(AA^T-I)||_F^2 $$
-
-
-It can be seen from the above formula that the more similar the rows of `A` are to one another, the larger `P` becomes, and the more the rows differ, the smaller `P` is. In other words, the greater the diversity across the r attention hops of `A`, the smaller the penalty. By including this penalty term in the loss of the model, we encourage the diversity of `A`.
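-
-For intuition, here is a small NumPy sketch with made-up attention weights, separate from the training code below, showing that a redundant `A` yields a larger penalty than a diverse one:
-
-```{.python .input}
-import numpy as np
-
-def frobenius_penalty(A):
-    # ||A A^T - I||_F^2 for an r-by-n attention matrix A
-    r = A.shape[0]
-    return np.linalg.norm(A @ A.T - np.eye(r)) ** 2
-
-# Two attention hops over four tokens; each row sums to one, as after a softmax
-A_redundant = np.array([[0.4, 0.3, 0.2, 0.1],
-                        [0.4, 0.3, 0.2, 0.1]])    # both hops attend identically
-A_diverse = np.array([[0.90, 0.05, 0.03, 0.02],
-                      [0.02, 0.03, 0.05, 0.90]])  # hops attend to different words
-
-print(frobenius_penalty(A_redundant))  # ~1.16, larger penalty
-print(frobenius_penalty(A_diverse))    # ~0.07, smaller penalty
-```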
-
-We incorporate these findings in the code below, adding the penalty term, scaled by a penalty coefficient, to the standard loss function.
-
-
-```{.python .input}
-def calculate_loss(x, y, model, loss, class_weight, penal_coeff):
- pred, att = model(x)
- if loss_name == 'sce':
- l = loss(pred, y)
- elif loss_name == 'wsce':
- l = loss(pred, y, class_weight, class_weight.shape[0])
-
- # penalty
- diversity_penalty = nd.batch_dot(att, nd.transpose(att, axes=(0, 2, 1))
- ) - nd.eye(att.shape[1], ctx=att.context)
- l = l + penal_coeff * diversity_penalty.norm(axis=(1, 2))
-
- return pred, l
-```
-
-We then define what one epoch of training would be for the model, for easier use later. In addition, we calculate the loss, the F1 score, and the accuracy for each epoch and print them for easier understanding. We also decay the learning rate as the number of epochs increases, and include an `is_train` boolean so that we know whether we should be updating the model's parameters or just reporting the loss.
-
-```{.python .input}
-def one_epoch(data_iter, model, loss, trainer, ctx, is_train, epoch,
- penal_coeff=0.0, clip=None, class_weight=None, loss_name='wsce'):
-
- loss_val = 0.
- total_pred = []
- total_true = []
- n_batch = 0
-
- for batch_x, batch_y in data_iter:
- batch_x = batch_x.as_in_context(ctx)
- batch_y = batch_y.as_in_context(ctx)
-
- if is_train:
- with autograd.record():
- batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff)
-
-            # backward pass
- l.backward()
-
- # clip gradient
- clip_params = [p.data() for p in model.collect_params().values()]
- if clip is not None:
- norm = nd.array([0.0], ctx)
- for param in clip_params:
- if param.grad is not None:
- norm += (param.grad ** 2).sum()
- norm = norm.sqrt().asscalar()
- if norm > clip:
- for param in clip_params:
- if param.grad is not None:
- param.grad[:] *= clip / norm
-
-            # update params
- trainer.step(batch_x.shape[0])
-
- else:
- batch_pred, l = calculate_loss(batch_x, batch_y, model, loss, class_weight, penal_coeff)
-
- # keep result for metric
- batch_pred = nd.argmax(nd.softmax(batch_pred, axis=1), axis=1).asnumpy()
- batch_true = np.reshape(batch_y.asnumpy(), (-1, ))
- total_pred.extend(batch_pred.tolist())
- total_true.extend(batch_true.tolist())
-
- batch_loss = l.mean().asscalar()
-
- n_batch += 1
- loss_val += batch_loss
-
-        # check the result of the training phase
- if is_train and n_batch % 400 == 0:
- print('epoch %d, batch %d, batch_train_loss %.4f, batch_train_acc %.3f' %
- (epoch, n_batch, batch_loss, accuracy_score(batch_true, batch_pred)))
-
- # metric
- F1 = f1_score(np.array(total_true), np.array(total_pred), average='weighted')
- acc = accuracy_score(np.array(total_true), np.array(total_pred))
- loss_val /= n_batch
-
- if is_train:
- print('epoch %d, learning_rate %.5f \n\t train_loss %.4f, acc_train %.3f, F1_train %.3f, ' %
- (epoch, trainer.learning_rate, loss_val, acc, F1))
-        # decay lr
- if epoch % 2 == 0:
- trainer.set_learning_rate(trainer.learning_rate * 0.9)
- else:
- print('\t valid_loss %.4f, acc_valid %.3f, F1_valid %.3f, ' % (loss_val, acc, F1))
-
-```
-
-In addition, we include a helper method `train_valid`, which runs one epoch over the training data and then one over the validation data, using the `is_train` boolean to swap between the two modes discussed above.
-
-```{.python .input}
-def train_valid(data_iter_train, data_iter_valid, model, loss, trainer, ctx, nepochs,
- penal_coeff=0.0, clip=None, class_weight=None, loss_name='wsce'):
-
- for epoch in range(1, nepochs+1):
- start = time.time()
- # train
- is_train = True
- one_epoch(data_iter_train, model, loss, trainer, ctx, is_train,
- epoch, penal_coeff, clip, class_weight, loss_name)
-
- # valid
- is_train = False
- one_epoch(data_iter_valid, model, loss, trainer, ctx, is_train,
- epoch, penal_coeff, clip, class_weight, loss_name)
- end = time.time()
- print('time %.2f sec' % (end-start))
- print("*"*100)
-
-```
-
-## Training the model
-
-Now that we are ready to train the model, we use `WeightedSoftmaxCE` to alleviate the problem of class imbalance in the data. We perform statistical analysis on the data in advance to obtain a set of `class_weight`s.
-
-
-```{.python .input}
-class_weight = None
-loss_name = 'wsce'
-optim = 'adam'
-lr = 0.001
-penal_coeff = 0.1
-clip = 0.5
-nepochs = 4
-
-trainer = gluon.Trainer(model.collect_params(), optim, {'learning_rate': lr})
-
-if loss_name == 'sce':
- loss = gluon.loss.SoftmaxCrossEntropyLoss()
-elif loss_name == 'wsce':
- loss = WeightedSoftmaxCE()
- # the value of class_weight is obtained by counting data in advance. It can be seen as a hyperparameter.
- class_weight = nd.array([3.0, 5.3, 4.0, 2.0, 1.0], ctx=ctx)
-```
-
-Because we created the necessary helper methods earlier, training is as simple as the line of code below.
-
-```{.python .input}
-# train and valid
-train_valid(train_dataloader, valid_dataloader, model, loss, trainer, ctx, nepochs,
- penal_coeff=penal_coeff, clip=clip, class_weight=class_weight, loss_name=loss_name)
-```
-
-## Predictions and sampling using our model
-
-Now that the model has been trained, we can feed an arbitrary sentence into it and predict its sentiment label. The labels range from one through five, corresponding to degrees of sentiment from negative to positive.
-
-```{.python .input}
-input_ar = nd.array(vocab[['This', 'movie', 'is', 'amazing']], ctx=ctx).reshape((1, -1))
-pred, att = model(input_ar)
-
-label = np.argmax(nd.softmax(pred, axis=1).asnumpy(), axis=1) + 1
-print(label)
-print(att)
-```
-
-In order to intuitively understand the role of the attention mechanism, we visualize the output of the model's attention on the predicted samples using the `matplotlib` and `seaborn` modules.
-
-```{.python .input}
-# Visualizing the attention layer
-
-import matplotlib.pyplot as plt
-import seaborn as sns
-%matplotlib inline
-
-np.squeeze(att.asnumpy(), 0).shape
-plt.figure(figsize=(8,1))
-cmap = sns.diverging_palette(220, 10, as_cmap=True)
-sns.heatmap(np.squeeze(att.asnumpy(), 0), cmap=cmap, annot=True,
- xticklabels=['This', 'movie', 'is', 'amazing'], yticklabels=['att0', 'att1'])
-plt.show()
-```
-
-## Conclusions
-
-Word embedding can effectively represent the semantic similarity between words, which allows for many breakthroughs in complex natural language processing tasks. Attention mechanisms can intuitively grasp the important semantic features in the sentence. The LSTM captures the word-order relationship between words in a sentence. Through a combination of these three, word embeddings, LSTMs, and attention mechanisms, we can effectively represent the semantics of a sentence and apply it to many practical tasks.
-
-GluonNLP provides us with an efficient and convenient toolbox to help us experiment quickly. This greatly simplifies the tedious work of many natural language processing tasks.
-
-## References
-
-1. [A Structured Self-Attentive Sentence Embedding](https://arxiv.org/pdf/1703.03130.pdf)
-2. [Glove: Global vectors for word representation. In Proceedings of the 2014 conference on empirical methods in natural language processing](https://nlp.stanford.edu/pubs/glove.pdf)
-3. [Long short-term memory](https://www.bioinf.jku.at/publications/older/2604.pdf)
-4. [Skymind.AI A Beginner's Guide to Attention Mechanisms and Memory Networks](https://skymind.ai/wiki/attention-mechanism-memory-network)
diff --git a/docs/examples/sentiment_analysis/sentiment_analysis.md b/docs/examples/sentiment_analysis/sentiment_analysis.md
deleted file mode 100644
index 9559bff86f..0000000000
--- a/docs/examples/sentiment_analysis/sentiment_analysis.md
+++ /dev/null
@@ -1,354 +0,0 @@
-# Fine-tuning LSTM-based Language Model
-
-Now that we've covered some advanced topics using advanced models, let's return to the basics and show how these techniques can help us even when addressing the comparatively simple problem of classification. In particular, we'll look at the classic problem of sentiment analysis: taking an input consisting of a string of text and classifying its sentiment as positive or negative.
-
-In this notebook, we are going to use GluonNLP to build a sentiment analysis model whose weights are initialized based on a pre-trained language model. Using pre-trained language model weights is a common approach for semi-supervised learning in NLP. In order to do a good job of language modeling on a large corpus of text, our model must learn representations that contain information about the structure of natural language. Intuitively, by starting with these good features rather than with random features, we're able to converge faster towards a superior model for our downstream task.
-
-With GluonNLP, we can quickly prototype the model, and it's easy to customize. The building process consists of just three simple steps. For this demonstration we'll focus on movie reviews from the Large Movie Review Dataset, also known as the IMDB dataset. Given a movie review, our model will output a prediction of its sentiment, which can be positive or negative.
-
-
-## Setup
-
-Firstly, we must load the required modules. Please remember to download the archive from the top of this tutorial
-if you'd like to follow along. We set the random seed so the outcome can be relatively consistent.
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import random
-import time
-import multiprocessing as mp
-import numpy as np
-
-import mxnet as mx
-from mxnet import nd, gluon, autograd
-
-import gluonnlp as nlp
-nlp.utils.check_version('0.7.0')
-
-random.seed(123)
-np.random.seed(123)
-mx.random.seed(123)
-```
-
-## Sentiment analysis model with pre-trained language model encoder
-
-So that we can easily transplant the pre-trained weights, we'll base our model architecture on the pre-trained language model (LM). Following the LSTM layer, we have one representation vector for each word in the sentence. Because we plan to make a single prediction (as opposed to one per word), we'll first pool our predictions across time steps before feeding them through a dense last layer to produce our final prediction (a single sigmoid output node).
-
-![sa-model](samodel-v3.png)
-
-Specifically, our model represents input words by their embeddings. Following the embedding layer, our model consists of a two-layer LSTM, followed by an average pooling layer, followed by a sigmoid output layer (all illustrated in the figure above).
-
-Thus, given an input sequence, the memory cells in the LSTM layer will produce a representation sequence. This representation sequence is then averaged over all time steps resulting in a fixed-length sentence representation $h$. Finally, we apply a sigmoid output layer on top of $h$. We’re using the sigmoid activation function because we’re trying to predict if this text has positive or negative sentiment. A sigmoid activation function squashes the output values to the range [0,1], allowing us to interpret this output as a probability, making our lives relatively simpler.
-
-Below we define our `MeanPoolingLayer` and basic sentiment analysis network's (`SentimentNet`) structure.
-
-```{.python .input}
-class MeanPoolingLayer(gluon.HybridBlock):
- """A block for mean pooling of encoder features"""
- def __init__(self, prefix=None, params=None):
- super(MeanPoolingLayer, self).__init__(prefix=prefix, params=params)
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- """Forward logic"""
- # Data will have shape (T, N, C)
- masked_encoded = F.SequenceMask(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
- F.expand_dims(valid_length, axis=1))
- return agg_state
-
-
-class SentimentNet(gluon.HybridBlock):
- """Network for sentiment analysis."""
- def __init__(self, dropout, prefix=None, params=None):
- super(SentimentNet, self).__init__(prefix=prefix, params=params)
- with self.name_scope():
- self.embedding = None # will set with lm embedding later
- self.encoder = None # will set with lm encoder later
- self.agg_layer = MeanPoolingLayer()
- self.output = gluon.nn.HybridSequential()
- with self.output.name_scope():
- self.output.add(gluon.nn.Dropout(dropout))
- self.output.add(gluon.nn.Dense(1, flatten=False))
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- encoded = self.encoder(self.embedding(data)) # Shape(T, N, C)
- agg_state = self.agg_layer(encoded, valid_length)
- out = self.output(agg_state)
- return out
-```
-
-## Defining the hyperparameters and initializing the model
-
-### Hyperparameters
-
-Our model is based on a standard LSTM model. We use a hidden layer size of 200. We use bucketing for speeding up the processing of variable-length sequences. We don't configure dropout for this model as it could be deleterious to the results.
-
-```{.python .input}
-dropout = 0
-language_model_name = 'standard_lstm_lm_200'
-pretrained = True
-learning_rate, batch_size = 0.005, 32
-bucket_num, bucket_ratio = 10, 0.2
-epochs = 1
-grad_clip = None
-log_interval = 100
-```
-
-If your environment supports GPUs, keep the context value the same. If it doesn't, swap the `mx.gpu(0)` to `mx.cpu()`.
-
-```{.python .input}
-context = mx.gpu(0)
-```
-
-### Loading the pre-trained model
-
-The loading of the pre-trained model, like in previous tutorials, is as simple as one line.
-
-```{.python .input}
-lm_model, vocab = nlp.model.get_model(name=language_model_name,
- dataset_name='wikitext-2',
- pretrained=pretrained,
- ctx=context,
- dropout=dropout)
-```
-
-### Creating the sentiment analysis model from the loaded pre-trained model
-
-In the code above, we have already acquired a pre-trained model on the Wikitext-2 dataset using `nlp.model.get_model`. Below, we construct a `SentimentNet` object, which takes as input the embedding layer and encoder of the pre-trained model.
-
-As we employ the pre-trained embedding layer and encoder, *we only need to initialize the output layer* using `net.output.initialize(mx.init.Xavier(), ctx=context)`.
-
-```{.python .input}
-net = SentimentNet(dropout=dropout)
-net.embedding = lm_model.embedding
-net.encoder = lm_model.encoder
-net.hybridize()
-net.output.initialize(mx.init.Xavier(), ctx=context)
-print(net)
-```
-
-## The data pipeline
-
-In this section, we describe in detail the data pipeline, from initialization to modifying it for use in our model.
-
-### Loading the sentiment analysis dataset (IMDB reviews)
-
-In the labeled train/test sets, out of a maximum score of 10, a negative review has a score of no more than 4, and a positive review has a score of no less than 7. Thus reviews with more neutral ratings are not included in the train/test sets. We label a negative review whose score <= 4 as 0, and a
-positive review whose score >= 7 as 1. As the neutral ratings are not
-included in the datasets, we can use 5 as our threshold.
-
-```{.python .input}
-# The tokenizer takes as input a string and outputs a list of tokens.
-tokenizer = nlp.data.SpacyTokenizer('en')
-
-# `length_clip` takes as input a list and outputs a list with maximum length 500.
-length_clip = nlp.data.ClipSequence(500)
-
-# Helper function to preprocess a single data point
-def preprocess(x):
- data, label = x
- label = int(label > 5)
- # A token index or a list of token indices is
- # returned according to the vocabulary.
- data = vocab[length_clip(tokenizer(data))]
- return data, label
-
-# Helper function for getting the length
-def get_length(x):
- return float(len(x[0]))
-
-# Loading the dataset
-train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- for segment in ('train', 'test')]
-print('Tokenize using spaCy...')
-
-```
-
-Here we use the helper functions defined above to make pre-processing the dataset relatively stress-free and concise. As in a previous tutorial, `mp.Pool()` is leveraged to divide the work of preprocessing to multiple cores/machines.
-
-```{.python .input}
-def preprocess_dataset(dataset):
- start = time.time()
- with mp.Pool() as pool:
- # Each sample is processed in an asynchronous manner.
- dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
- lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Doing the actual pre-processing of the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-```
-
-In the following code, we use FixedBucketSampler, which assigns each data sample to a fixed bucket based on its length. The bucket keys are either given or generated from the input sequence lengths and the number of buckets.
-
-```{.python .input}
-# Construct the DataLoader
-
-def get_dataloader():
-
- # Pad data, stack label and lengths
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
- nlp.data.batchify.Stack(dtype='float32'))
- batch_sampler = nlp.data.sampler.FixedBucketSampler(
- train_data_lengths,
- batch_size=batch_size,
- num_buckets=bucket_num,
- ratio=bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
-
- # Construct a DataLoader object for both the training and test data
- train_dataloader = gluon.data.DataLoader(
- dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- test_dataloader = gluon.data.DataLoader(
- dataset=test_dataset,
- batch_size=batch_size,
- shuffle=False,
- batchify_fn=batchify_fn)
- return train_dataloader, test_dataloader
-
-# Use the pre-defined function to make the retrieval of the DataLoader objects simple
-train_dataloader, test_dataloader = get_dataloader()
-```
-
-## Training the model
-
-Now that all the data has been pre-processed and the model architecture has been loosely defined, we can define the helper functions for evaluation and training of the model.
-
-### Evaluation using loss and accuracy
-
-Here, we define a function `evaluate(net, dataloader, context)` to determine the loss and accuracy of our model in a concise way. The code is very similar to evaluation of other models in the previous tutorials. For more information and explanation of this code, please refer to the previous tutorial on [LSTM-based Language Models](https://gluon-nlp.mxnet.io/master/examples/language_model/language_model.html).
-
-```{.python .input}
-def evaluate(net, dataloader, context):
- loss = gluon.loss.SigmoidBCELoss()
- total_L = 0.0
- total_sample_num = 0
- total_correct_num = 0
- start_log_interval_time = time.time()
-
- print('Begin Testing...')
- for i, ((data, valid_length), label) in enumerate(dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- label = label.as_in_context(context)
- output = net(data, valid_length)
-
- L = loss(output, label)
- pred = (output > 0.5).reshape(-1)
- total_L += L.sum().asscalar()
- total_sample_num += label.shape[0]
- total_correct_num += (pred == label).sum().asscalar()
-
- if (i + 1) % log_interval == 0:
- print('[Batch {}/{}] elapsed {:.2f} s'.format(
- i + 1, len(dataloader),
- time.time() - start_log_interval_time))
- start_log_interval_time = time.time()
-
- avg_L = total_L / float(total_sample_num)
- acc = total_correct_num / float(total_sample_num)
-
- return avg_L, acc
-```
-
-In the following code, we define the training function `train`. For each epoch it iterates over the training mini-batches, optionally clips the global gradient norm, logs the loss and throughput at regular intervals, and evaluates the model on the test set at the end of the epoch.
-
-```{.python .input}
-def train(net, context, epochs):
- trainer = gluon.Trainer(net.collect_params(), 'ftml',
- {'learning_rate': learning_rate})
- loss = gluon.loss.SigmoidBCELoss()
-
- parameters = net.collect_params().values()
-
- # Training/Testing
- for epoch in range(epochs):
- # Epoch training stats
- start_epoch_time = time.time()
- epoch_L = 0.0
- epoch_sent_num = 0
- epoch_wc = 0
- # Log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0.0
-
- for i, ((data, length), label) in enumerate(train_dataloader):
- L = 0
- wc = length.sum().asscalar()
- log_interval_wc += wc
- epoch_wc += wc
- log_interval_sent_num += data.shape[1]
- epoch_sent_num += data.shape[1]
- with autograd.record():
- output = net(data.as_in_context(context).T,
- length.as_in_context(context)
- .astype(np.float32))
- L = L + loss(output, label.as_in_context(context)).mean()
- L.backward()
- # Clip gradient
- if grad_clip:
- gluon.utils.clip_global_norm(
- [p.grad(context) for p in parameters],
- grad_clip)
- # Update parameter
- trainer.step(1)
- log_interval_L += L.asscalar()
- epoch_L += L.asscalar()
- if (i + 1) % log_interval == 0:
- print(
- '[Epoch {} Batch {}/{}] elapsed {:.2f} s, '
- 'avg loss {:.6f}, throughput {:.2f}K wps'.format(
- epoch, i + 1, len(train_dataloader),
- time.time() - start_log_interval_time,
- log_interval_L / log_interval_sent_num, log_interval_wc
- / 1000 / (time.time() - start_log_interval_time)))
- # Clear log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0
- end_epoch_time = time.time()
- test_avg_L, test_acc = evaluate(net, test_dataloader, context)
- print('[Epoch {}] train avg loss {:.6f}, test acc {:.2f}, '
- 'test avg loss {:.6f}, throughput {:.2f}K wps'.format(
- epoch, epoch_L / epoch_sent_num, test_acc, test_avg_L,
- epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
-```
-
-And finally, because of all the helper functions we've defined, training our model becomes simply one line of code!
-
-```{.python .input}
-train(net, context, epochs)
-```
-
-And testing it becomes as simple as feeding in a sample sentence, as below:
-
-```{.python .input}
-net(
- mx.nd.reshape(
- mx.nd.array(vocab[['This', 'movie', 'is', 'amazing']], ctx=context),
- shape=(-1, 1)), mx.nd.array([4], ctx=context)).sigmoid()
-```
-
-Indeed, we can feed in any sentence and determine the sentiment with relative ease!
-
-## Conclusion
-
-We built a sentiment analysis model by reusing the feature extractor from the pre-trained language model. The modular design of Gluon blocks makes it very easy to put together models for various needs. GluonNLP provides powerful building blocks that substantially simplify the process of constructing an efficient data pipeline and versatile models.
-
-### More information
-
-The GluonNLP documentation and more tutorials are available at http://gluon-nlp.mxnet.io/index.html to help you get to know and use our tool as easily as possible.
diff --git a/docs/examples/sequence_sampling/index.rst b/docs/examples/sequence_sampling/index.rst
deleted file mode 100644
index 7ad83d697e..0000000000
--- a/docs/examples/sequence_sampling/index.rst
+++ /dev/null
@@ -1,21 +0,0 @@
-Text Generation
-===============
-
-.. container:: cards
-
- .. card::
- :title: Sequence Generation with Beam Search Sampler and Sequence Sampler
- :link: sequence_sampling.html
-
- Learn how to generate sentence from pre-trained language model through sampling and beam
- search.
-
-
-.. toctree::
- :hidden:
- :maxdepth: 2
-
- sequence_sampling.ipynb
-
-
-
diff --git a/docs/examples/sequence_sampling/sequence_sampling.md b/docs/examples/sequence_sampling/sequence_sampling.md
deleted file mode 100644
index 20fb865302..0000000000
--- a/docs/examples/sequence_sampling/sequence_sampling.md
+++ /dev/null
@@ -1,197 +0,0 @@
-# Sequence Generation with Beam Search Sampler and Sequence Sampler
-
-This tutorial demonstrates how to sample sequences using a
-pre-trained language model in the following two ways: with a beam search sampler
-and with a sequence sampler.
-
-Let's use `V` to denote the vocabulary size, and `T` to denote the sequence
-length. Given a language model, we can sample sequences according to the
-probability that our model assigns to them. At each time step, a
-language model predicts the likelihood of each word occurring, given the context
-from prior time steps. The output at any time step can be any word from the
-vocabulary, whose size is `V`, so the number of all possible outcomes for a
-sequence of length `T` is $$V^T$$.
-
-While sometimes we might want to sample
-sentences according to their probability of occurring, at other times we want to
-find the sentences that *are most likely to occur*. This is especially true in
-the case of language translation where we don't just want to see *a*
-translation. We want the *best* translation. While finding the optimal outcome
-quickly becomes intractable as the sequence length increases, there are still many ways to
-sample reasonably good sequences. GluonNLP provides two samplers for generating
-from a language model: `BeamSearchSampler` and `SequenceSampler`.
-
-## Loading a pre-trained language model (LM)
-
-Firstly, let's load a pre-trained language model,
-from which we will sample sequences. GluonNLP makes this a painless process.
-
-```{.python .input}
-import mxnet as mx
-import gluonnlp as nlp
-nlp.utils.check_version('0.8.0')
-
-ctx = mx.cpu()
-lm_model, vocab = nlp.model.get_model(name='awd_lstm_lm_1150',
- dataset_name='wikitext-2',
- pretrained=True,
- ctx=ctx)
-```
-
-## Sampling a Sequence with `BeamSearchSampler`
-
-To overcome the exponential complexity in sequence decoding, beam search decodes
-greedily, keeping those sequences that are most likely based on the probability
-up to the current time step. The size of this subset is called the *beam size*.
-Suppose the beam size is `K` and the output vocabulary size is `V`. When
-selecting the beams to keep, the beam search algorithm first predicts all
-possible successor words from the previous `K` beams, each of which has `V`
-possible outputs. This becomes a total of `K*V` candidate paths. Beam search then ranks
-these `K*V` paths by their scores and keeps only the top `K` paths, as sketched in the toy example below.
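-
-To make the pruning step concrete, here is a tiny, self-contained toy example of a single beam-search step with made-up log-probabilities (`K = 2` beams, `V = 3` words); it only illustrates the ranking logic, not the actual `BeamSearchSampler` implementation:
-
-```{.python .input}
-# Two beams with their accumulated log-probabilities
-beams = [(['I', 'love'], -1.2), (['I', 'like'], -1.5)]
-# Made-up next-word log-probabilities for each beam (vocabulary of 3 words)
-next_logp = {'love': {'it': -0.3, 'you': -0.9, 'cats': -2.0},
-             'like': {'it': -0.5, 'you': -1.1, 'cats': -1.8}}
-
-# Expand every beam with every word: K*V = 6 candidate paths
-candidates = [(words + [w], score + lp)
-              for words, score in beams
-              for w, lp in next_logp[words[-1]].items()]
-
-# Keep only the top K paths by accumulated score
-K = 2
-beams = sorted(candidates, key=lambda c: c[1], reverse=True)[:K]
-print(beams)  # [(['I', 'love', 'it'], -1.5), (['I', 'like', 'it'], -2.0)]
-```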
-
-Let's take a look at how to construct a `BeamSearchSampler`. The
-`nlp.model.BeamSearchSampler` class takes the following arguments for
-customization and extension:
-
-- beam_size : the beam size
-- decoder : callable function of the one-step-ahead decoder
-- eos_id : the id of the EOS token
-- scorer: the score function used in beam search
-- max_length: the maximum search length
-
-For beam search to work, we need a scorer function.
-
-#### The scorer function
-
-In this tutorial, we will use the `BeamSearchScorer`
-as the scorer function, which implements the scoring function with length penalty in the
-[Google NMT](https://arxiv.org/pdf/1609.08144.pdf) paper:
-
-```{.python .input}
-scorer = nlp.model.BeamSearchScorer(alpha=0, K=5, from_logits=False)
-```
-
-Defining the scorer is as simple as this one line.
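-
-For reference, our reading of the length-penalized score from the Google NMT paper, which this scorer follows with the `alpha` and `K` arguments shown above, is:
-
-$$
-score(Y) = \frac{\log P(Y)}{lp(Y)}, \qquad lp(Y) = \frac{(K + |Y|)^{\alpha}}{(K + 1)^{\alpha}}
-$$
-
-With `alpha=0`, as in the line above, the length penalty equals 1 and candidates are ranked purely by their accumulated log-probability.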
-
-#### The decoder function
-
-Next, we define the decoder based on the pre-trained
-language model.
-
-```{.python .input}
-class LMDecoder(object):
- def __init__(self, model):
- self._model = model
- def __call__(self, inputs, states):
- outputs, states = self._model(mx.nd.expand_dims(inputs, axis=0), states)
- return outputs[0], states
- def state_info(self, *arg, **kwargs):
- return self._model.state_info(*arg, **kwargs)
-decoder = LMDecoder(lm_model)
-```
-
-#### Beam Search Sampler
-
-Given a scorer and a decoder, we are ready to create a sampler. We use the symbol `.`
-to indicate the end of sentence (EOS). We can use the vocab to get the index of the
-EOS token and then feed that index to the sampler. The following code shows how to
-construct a beam search sampler. We will create a sampler with 5 beams and a
-maximum sample length of 20.
-
-```{.python .input}
-eos_id = vocab['.']
-beam_sampler = nlp.model.BeamSearchSampler(beam_size=5,
- decoder=decoder,
- eos_id=eos_id,
- scorer=scorer,
- max_length=20)
-```
-
-It's really that simple!
-
-#### Generate Sequences with Beam Search
-
-Next, we are going to generate sentences starting with "I love it" using beam
-search first. We feed ['I', 'love'] to the language model to get the initial
-states and set the initial input to be the word 'it'. We will then print the
-top-5 generations.
-
-```{.python .input}
-bos = 'I love it'.split()
-bos_ids = [vocab[ele] for ele in bos]
-begin_states = lm_model.begin_state(batch_size=1, ctx=ctx)
-if len(bos_ids) > 1:
- _, begin_states = lm_model(mx.nd.expand_dims(mx.nd.array(bos_ids[:-1]), axis=1),
- begin_states)
-inputs = mx.nd.full(shape=(1,), ctx=ctx, val=bos_ids[-1])
-```
-
-Here we define the helper function to generate the sequences so we can simply use one line
-to generate new sequences for any given input.
-
-```{.python .input}
-def generate_sequences(sampler, inputs, begin_states, num_print_outcomes):
-
- samples, scores, valid_lengths = sampler(inputs, begin_states)
- samples = samples[0].asnumpy()
- scores = scores[0].asnumpy()
- valid_lengths = valid_lengths[0].asnumpy()
- print('Generation Result:')
-
- for i in range(num_print_outcomes):
- sentence = bos[:-1]
-
- for ele in samples[i][:valid_lengths[i]]:
- sentence.append(vocab.idx_to_token[ele])
-
- print([' '.join(sentence), scores[i]])
-```
-
-And then below, we have the one-liner to generate the sequences.
-
-```{.python .input}
-generate_sequences(beam_sampler, inputs, begin_states, 5)
-```
-
-## Sampling a Sequence with `SequenceSampler`
-
-The previous generation results
-may look a bit boring. Instead, let's now use the sequence sampler to get relatively more
-interesting results.
-
-A `SequenceSampler` samples from the contextual multinomial distribution
-produced by the language model at each time step. Since we may want to control
-how "sharp" the distribution is, trading off diversity against correctness, we can
-use the temperature option in `SequenceSampler`, which controls the temperature
-of the softmax activation function.
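-
-As a rough illustration of what the temperature does, here is a minimal NumPy sketch with made-up logits (this is not the sampler's internal code):
-
-```{.python .input}
-import numpy as np
-
-def softmax_with_temperature(logits, temperature):
-    # Softmax over logits scaled by 1 / temperature
-    scaled = np.asarray(logits) / temperature
-    exp = np.exp(scaled - scaled.max())
-    return exp / exp.sum()
-
-logits = [2.0, 1.0, 0.0]
-print(softmax_with_temperature(logits, 0.5))  # ~[0.87, 0.12, 0.02], sharper
-print(softmax_with_temperature(logits, 2.0))  # ~[0.51, 0.31, 0.19], flatter
-```
-
-Lower temperatures concentrate probability mass on the most likely words, while higher temperatures flatten the distribution and yield more diverse (and noisier) samples.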
-
-For each input, sequence sampler can sample
-multiple **independent** sequences at once. The number of independent sequences
-to sample can be specified through the argument `beam_size`.
-
-Defining the `SequenceSampler` is as simple as this:
-
-```{.python .input}
-seq_sampler = nlp.model.SequenceSampler(beam_size=5,
- decoder=decoder,
- eos_id=eos_id,
- max_length=100,
- temperature=0.97)
-```
-
-
-#### Generate Sequences with Sequence Sampler
-
-Now, instead of using the beam sampler for our `generate_sequences` function, we can use the `SequenceSampler` instead to sample sequences based on the same inputs used previously.
-
-```{.python .input}
-generate_sequences(seq_sampler, inputs, begin_states, 5)
-```
-
-Et voila! We've generated a set of sampled sentences based on our given input.
-
-#### Exercises for the keen reader
-
-- Tweak alpha and K in the BeamSearchScorer. How do the results
-change? Does it do relatively better or worse than the SequenceSampler?
-- Try decoding from different inputs and figure out in which cases the BeamSearchSampler does better than the SequenceSampler.
diff --git a/docs/examples/word_embedding/data.py b/docs/examples/word_embedding/data.py
deleted file mode 120000
index fae6ca2f33..0000000000
--- a/docs/examples/word_embedding/data.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/data.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/index.rst b/docs/examples/word_embedding/index.rst
index 9f4bc9d120..5dbc578ceb 100644
--- a/docs/examples/word_embedding/index.rst
+++ b/docs/examples/word_embedding/index.rst
@@ -10,33 +10,9 @@ Representation Learning
Basics on how to use word embedding with vocab in GluonNLP and apply it on word similarity and
analogy problems.
- .. card::
- :title: Word Embeddings Training and Evaluation
- :link: word_embedding_training.html
-
- Learn how to train fastText and word2vec embeddings on your own dataset, and determine
- embedding quality through intrinsic evaluation.
-
- .. card::
- :title: Extracting Sentence Features with Pre-trained ELMo
- :link: ../sentence_embedding/elmo_sentence_representation.html
-
- See how to use GluonNLP's model API to automatically download the pre-trained ELMo
- model from NAACL2018 best paper, and extract features with it.
-
- .. card::
- :title: Fine-tuning Pre-trained BERT Models
- :link: ../sentence_embedding/bert.html
-
- See how to use GluonNLP to fine-tune a sentence pair classification model with
- pre-trained BERT parameters.
-
.. toctree::
:hidden:
:maxdepth: 2
word_embedding.ipynb
- word_embedding_training.ipynb
- ../sentence_embedding/elmo_sentence_representation.ipynb
- ../sentence_embedding/bert.ipynb
\ No newline at end of file
diff --git a/docs/examples/word_embedding/model.py b/docs/examples/word_embedding/model.py
deleted file mode 120000
index aaaeb28aa3..0000000000
--- a/docs/examples/word_embedding/model.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/model.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/utils.py b/docs/examples/word_embedding/utils.py
deleted file mode 120000
index 43bec44533..0000000000
--- a/docs/examples/word_embedding/utils.py
+++ /dev/null
@@ -1 +0,0 @@
-../../model_zoo/word_embeddings/utils.py
\ No newline at end of file
diff --git a/docs/examples/word_embedding/word_embedding.md b/docs/examples/word_embedding/word_embedding.md
index 475355b8ec..6eea630294 100644
--- a/docs/examples/word_embedding/word_embedding.md
+++ b/docs/examples/word_embedding/word_embedding.md
@@ -33,11 +33,12 @@ To begin, let's first import a few packages that we'll need for this example:
import warnings
warnings.filterwarnings('ignore')
-from mxnet import gluon
-from mxnet import nd
+from mxnet import gluon, nd
import gluonnlp as nlp
import re
-nlp.utils.check_version('0.7.0')
+import collections
+import numpy as np
+
```
## Creating Vocabulary with Word Embeddings
@@ -54,18 +55,18 @@ in just a few lines of code.
To begin, suppose that we have a simple text data set consisting of newline-separated strings.
```{.python .input}
-text = " hello world \n hello nice world \n hi world \n"
+text = " hello world \n hello nice world \n hi world \n goodgod"
```
To start, let's implement a simple tokenizer to separate the words and then count the frequency of each word in the data set. We can use our defined tokenizer to count word frequency in the data set.
```{.python .input}
def simple_tokenize(source_str, token_delim=' ', seq_delim='\n'):
- return filter(None, re.split(token_delim + '|' + seq_delim, source_str))
-counter = nlp.data.count_tokens(simple_tokenize(text))
+ return list(filter(None, re.split(token_delim + '|' + seq_delim, source_str)))
+counter = collections.Counter(simple_tokenize(text))
```
-The obtained `counter` behaves like a Python dictionary whose key-value pairs consist of words and their frequencies, respectively.
+The obtained `counter`'s key-value pairs consist of words and their frequencies, respectively.
We can then instantiate a `Vocab` object with a counter.
Because `counter` tracks word frequencies, we are able to specify arguments
such as `max_size` (maximum size) and `min_freq` (minimum frequency) to the `Vocab` constructor to restrict the size of the resulting vocabulary.
@@ -74,86 +75,133 @@ Suppose that we want to build indices for all the keys in counter.
 If we simply want to construct a `Vocab` containing every word, then we can supply `counter` as the only argument.
```{.python .input}
-vocab = nlp.Vocab(counter)
+vocab = nlp.data.Vocab(counter)
```
-A `Vocab` object associates each word with an index. We can easily access words by their indices using the `vocab.idx_to_token` attribute.
+A `Vocab` object associates each word with an index. We can easily access words by their indices using the `vocab.all_tokens` attribute.
```{.python .input}
-for word in vocab.idx_to_token:
+for word in vocab.all_tokens:
print(word)
```
-Contrarily, we can also grab an index given a token using `vocab.token_to_idx`.
+Contrarily, we can also grab an index given a token using `__getitem__` or `vocab.token_to_idx`.
```{.python .input}
-print(vocab.token_to_idx[""])
+print(vocab[""])
print(vocab.token_to_idx["world"])
```
-In Gluon NLP, for each word, there are three representations: the index of where it occurred in the original input (idx), the embedding (or vector/vec), and the token (the actual word). At any point, we may use any of the following methods to switch between the three representations: `idx_to_vec`, `idx_to_token`, `token_to_idx`.
-### Attaching word embeddings
+### Load word embeddings
-Our next step will be to attach word embeddings to the words indexed by `vocab`.
+Our next step will be to load word embeddings for a given `vocab`.
In this example, we'll use *fastText* embeddings trained on the *wiki.simple* dataset.
-First, we'll want to create a word embedding instance by calling `nlp.embedding.create`,
-specifying the embedding type `fasttext` (an unnamed argument) and the source `source='wiki.simple'` (the named argument).
-
-```{.python .input}
-fasttext_simple = nlp.embedding.create('fasttext', source='wiki.simple')
-```
-
-To attach the newly loaded word embeddings `fasttext_simple` to indexed words in `vocab`, we can simply call vocab's `set_embedding` method:
```{.python .input}
-vocab.set_embedding(fasttext_simple)
+matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple')
```
To see other available sources of pretrained word embeddings using the *fastText* algorithm,
-we can call `text.embedding.list_sources`.
+we can call `nlp.embedding.list_sources`.
```{.python .input}
nlp.embedding.list_sources('fasttext')[:5]
```
-The created vocabulary `vocab` includes four different words and a special
+The created vocabulary `vocab` includes five different words and a special
unknown token. Let us check the size of `vocab`.
```{.python .input}
len(vocab)
```
-By default, the vector of any token that is unknown to `vocab` is a zero vector.
+By default, the vector of any token that is unknown to `vocab` is the vector of `vocab.unk_token`.
+Its length is equal to the vector dimensions of the fastText word embeddings:
+(300,).
+
+```{.python .input}
+matrix[vocab['beautiful']].shape
+```
+
+Let us check the shape of the embedding of the word 'hello' from `vocab`.
+
+```{.python .input}
+matrix[vocab['hello']].shape
+```
+
+We can access the first five elements of the embedding of 'hello' and see that they are non-zero.
+
+```{.python .input}
+matrix[vocab['hello']][:5]
+```
+
+By default, the vector of any token that is in `vocab` but not in the pre-trained file
+is a vector generated by sampling from a normal distribution
+with the same mean and standard deviation as the pre-trained embedding matrix.
Its length is equal to the vector dimensions of the fastText word embeddings:
(300,).
```{.python .input}
-vocab.embedding['beautiful'].shape
+matrix[vocab['goodgod']].shape
```
-The first five elements of the vector of any unknown token are zeros.
+We can access the first five elements of the embedding of 'goodgod'.
```{.python .input}
-vocab.embedding['beautiful'][:5]
+matrix[vocab['goodgod']][:5]
```
-Let us check the shape of the embedding of the words 'hello' and 'world' from `vocab`.
+You can change how vectors are generated for such tokens by
+specifying `unk_method` in the `load_embeddings` function.
+`unk_method` is a function that receives a `List[str]`
+and returns an embedding matrix (`numpy.ndarray`) for the words not found in the pre-trained file.
+For example,
```{.python .input}
-vocab.embedding['hello', 'world'].shape
+import numpy as np
+
+def simple(words):
+ return np.ones((len(words), 300))
+matrix = nlp.embedding.load_embeddings(vocab, 'wiki.simple', unk_method=simple)
```
-We can access the first five elements of the embedding of 'hello' and 'world' and see that they are non-zero.
+We can access the first five elements of the embedding of 'goodgod' and see that they are ones.
```{.python .input}
-vocab.embedding['hello', 'world'][:, :5]
+matrix[vocab['goodgod']][:5]
+```
+
+Sometimes we need to use `FastText` to compute vectors for out-of-vocabulary (OOV) words.
+For this case, we provide `get_fasttext_model`, which returns a `FastText` model you can use directly.
+
+```{.python .input}
+model = nlp.embedding.get_fasttext_model('wiki.en')
+```
+
+It returns a `fasttext.FastText._FastText` object; you can find more information
+about it at `fasttext.cc`.
+
+Let us check the shape of the embedding of the OOV word 'goodgod'.
+
+```{.python .input}
+model['goodgod'].shape
+```
+
+We can access the first five elements of the embedding of 'goodgod'.
+
+```{.python .input}
+model['goodgod'][:5]
+```
+
+To see other available sources of the `FastText` model,
+we can call `nlp.embedding.list_sources`.
+
+```{.python .input}
+nlp.embedding.list_sources('fasttext.bin')[:5]
```
### Using Pre-trained Word Embeddings in Gluon
-To demonstrate how to use pre-
-trained word embeddings in Gluon, let us first obtain the indices of the words
+To demonstrate how to use pre-trained word embeddings in Gluon, let us first obtain the indices of the words
'hello' and 'world'.
```{.python .input}
@@ -161,14 +209,14 @@ vocab['hello', 'world']
```
We can obtain the vectors for the words 'hello' and 'world' by specifying their
-indices (5 and 4) and the weight or embedding matrix, which we get from calling `vocab.embedding.idx_to_vec` in
-`gluon.nn.Embedding`. We initialize a new layer and set the weights using the layer.weight.set_data method. Subsequently, we pull out the indices 5 and 4 from the weight vector and check their first five entries.
+indices (5 and 4) and the embedding matrix, which we use to set the weights of
+`gluon.nn.Embedding`. We initialize a new layer and set the weights using the `layer.weight.set_data` method. Subsequently, we pull out the indices 5 and 4 from the weight matrix and check their first five entries.
```{.python .input}
-input_dim, output_dim = vocab.embedding.idx_to_vec.shape
+input_dim, output_dim = matrix.shape
layer = gluon.nn.Embedding(input_dim, output_dim)
layer.initialize()
-layer.weight.set_data(vocab.embedding.idx_to_vec)
+layer.weight.set_data(matrix)
layer(nd.array([5, 4]))[:, :5]
```
@@ -183,30 +231,24 @@ nlp.embedding.list_sources('glove')[:5]
```
For simplicity of demonstration, we use a smaller word embedding file, such as
-the 50-dimensional one.
-
-```{.python .input}
-glove_6b50d = nlp.embedding.create('glove', source='glove.6B.50d')
-```
-
-Now we create vocabulary by using all the tokens from `glove_6b50d`.
+the 50-dimensional one.
+Now we create a vocabulary using all the tokens from `glove.6B.50d`.
```{.python .input}
-vocab = nlp.Vocab(nlp.data.Counter(glove_6b50d.idx_to_token))
-vocab.set_embedding(glove_6b50d)
+matrix, vocab = nlp.embedding.load_embeddings(vocab=None, pretrained_name_or_dir='glove.6B.50d')
```
Below shows the size of `vocab` including a special unknown token.
```{.python .input}
-len(vocab.idx_to_token)
+len(vocab)
```
We can access attributes of `vocab`.
```{.python .input}
print(vocab['beautiful'])
-print(vocab.idx_to_token[71424])
+print(vocab.all_tokens[71424])
```
## Applications of Word Embeddings
@@ -215,18 +257,18 @@ To apply word embeddings, we need to define
cosine similarity. Cosine similarity determines the similarity between two vectors.
```{.python .input}
-from mxnet import nd
+import numpy as np
def cos_sim(x, y):
- return nd.dot(x, y) / (nd.norm(x) * nd.norm(y))
+ return np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
```
The range of cosine similarity between two vectors can be between -1 and 1. The
larger the value, the larger the similarity between the two vectors.
```{.python .input}
-x = nd.array([1, 2])
-y = nd.array([10, 20])
-z = nd.array([-1, -2])
+x = np.array([1, 2])
+y = np.array([10, 20])
+z = np.array([-1, -2])
print(cos_sim(x, y))
print(cos_sim(x, z))
@@ -245,14 +287,17 @@ We can then find the indices for which the dot product is greatest (`topk`), whi
```{.python .input}
def norm_vecs_by_row(x):
- return x / nd.sqrt(nd.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
-
-def get_knn(vocab, k, word):
- word_vec = vocab.embedding[word].reshape((-1, 1))
- vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
- dot_prod = nd.dot(vocab_vecs, word_vec)
- indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k+1, ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
+ return x / np.sqrt(np.sum(x * x, axis=1) + 1E-10).reshape((-1,1))
+
+def topk(res, k):
+ # Indices of the k largest values, ordered from most to least similar.
+ part = np.argpartition(res, -k)[-k:]
+ return part[np.argsort(res[part])].tolist()[::-1]
+
+def get_knn(vocab, matrix, k, word):
+ word_vec = matrix[vocab[word]].reshape((-1, 1))
+ vocab_vecs = norm_vecs_by_row(matrix)
+ dot_prod = np.dot(vocab_vecs, word_vec)
+ indices = topk(dot_prod.reshape((len(vocab), )), k=k+1)
# Remove unknown and input tokens.
return vocab.to_tokens(indices[1:])
```
@@ -261,31 +306,31 @@ Let us find the 5 most similar words to 'baby' from the vocabulary (size:
400,000 words).
```{.python .input}
-get_knn(vocab, 5, 'baby')
+get_knn(vocab, matrix, 5, 'baby')
```
We can verify the cosine similarity of the vectors of 'baby' and 'babies'.
```{.python .input}
-cos_sim(vocab.embedding['baby'], vocab.embedding['babies'])
+cos_sim(matrix[vocab['baby']], matrix[vocab['babies']])
```
Let us find the 5 most similar words to 'computers' from the vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'computers')
+get_knn(vocab, matrix, 5, 'computers')
```
Let us find the 5 most similar words to 'run' from the given vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'run')
+get_knn(vocab, matrix, 5, 'run')
```
Let us find the 5 most similar words to 'beautiful' from the vocabulary.
```{.python .input}
-get_knn(vocab, 5, 'beautiful')
+get_knn(vocab, matrix, 5, 'beautiful')
```
### Word Analogy
@@ -302,48 +347,47 @@ In this example,
we will find words that are analogous from the 400,000 indexed words in `vocab`.
```{.python .input}
-def get_top_k_by_analogy(vocab, k, word1, word2, word3):
- word_vecs = vocab.embedding[word1, word2, word3]
+def get_top_k_by_analogy(vocab, matrix, k, word1, word2, word3):
+ word_vecs = [matrix[vocab[word]] for word in [word1, word2, word3]]
word_diff = (word_vecs[1] - word_vecs[0] + word_vecs[2]).reshape((-1, 1))
- vocab_vecs = norm_vecs_by_row(vocab.embedding.idx_to_vec)
- dot_prod = nd.dot(vocab_vecs, word_diff)
- indices = nd.topk(dot_prod.reshape((len(vocab), )), k=k, ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
+ vocab_vecs = norm_vecs_by_row(matrix)
+ dot_prod = np.dot(vocab_vecs, word_diff)
+ indices = topk(dot_prod.reshape((len(vocab), )), k=k)
return vocab.to_tokens(indices)
```
We leverage this method to find the word to complete the analogy 'man : woman :: son :'.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'man', 'woman', 'son')
+get_top_k_by_analogy(vocab, matrix, 1, 'man', 'woman', 'son')
```
Let us verify the cosine similarity between vec('son')+vec('woman')-vec('man')
and vec('daughter').
```{.python .input}
-def cos_sim_word_analogy(vocab, word1, word2, word3, word4):
+def cos_sim_word_analogy(vocab, matrix, word1, word2, word3, word4):
words = [word1, word2, word3, word4]
- vecs = vocab.embedding[words]
+ vecs = [matrix[vocab[word]] for word in words]
return cos_sim(vecs[1] - vecs[0] + vecs[2], vecs[3])
-cos_sim_word_analogy(vocab, 'man', 'woman', 'son', 'daughter')
+cos_sim_word_analogy(vocab, matrix, 'man', 'woman', 'son', 'daughter')
```
And to perform some more tests, let's try the following analogy: 'beijing : china :: tokyo : '.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'beijing', 'china', 'tokyo')
+get_top_k_by_analogy(vocab, matrix, 1, 'beijing', 'china', 'tokyo')
```
And another word analogy: 'bad : worst :: big : '.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'bad', 'worst', 'big')
+get_top_k_by_analogy(vocab, matrix, 1, 'bad', 'worst', 'big')
```
And the last analogy: 'do : did :: go :'.
```{.python .input}
-get_top_k_by_analogy(vocab, 1, 'do', 'did', 'go')
+get_top_k_by_analogy(vocab, matrix, 1, 'do', 'did', 'go')
```
diff --git a/docs/examples/word_embedding/word_embedding_training.md b/docs/examples/word_embedding/word_embedding_training.md
deleted file mode 100644
index 819d239089..0000000000
--- a/docs/examples/word_embedding/word_embedding_training.md
+++ /dev/null
@@ -1,381 +0,0 @@
-# Word Embeddings Training and Evaluation
-
-```{.python .input}
-import warnings
-warnings.filterwarnings('ignore')
-
-import itertools
-import time
-import math
-import logging
-import random
-
-import mxnet as mx
-import gluonnlp as nlp
-import numpy as np
-from scipy import stats
-
-nlp.utils.check_version('0.7.0')
-
-# context = mx.cpu() # Enable this to run on CPU
-context = mx.gpu(0) # Enable this to run on GPU
-```
-
-## Data
-Here we use the Text8 corpus from the [Large Text Compression
-Benchmark](http://mattmahoney.net/dc/textdata.html) which includes the first
-100
-MB of cleaned text from Wikipedia in English.
-
-```{.python .input}
-text8 = nlp.data.Text8()
-print('# sentences:', len(text8))
-for sentence in text8[:3]:
- print('# tokens:', len(sentence), sentence[:5])
-```
-
-Given the tokenized data, we first count all tokens and then construct a
-vocabulary of all tokens that occur at least 5 times in the dataset. The
-vocabulary contains a one-to-one mapping between tokens and integers (also
-called indices or idx for short).
-
-Furthermore, we can store the frequency count of each
-token in the vocabulary as we will require this information later on for
-sampling random negative (or noise) words. Finally, we replace all tokens with
-their integer representation based on the vocabulary.
-
-```{.python .input}
-counter = nlp.data.count_tokens(itertools.chain.from_iterable(text8))
-vocab = nlp.Vocab(counter, unknown_token=None, padding_token=None,
- bos_token=None, eos_token=None, min_freq=5)
-idx_to_counts = [counter[w] for w in vocab.idx_to_token]
-
-def code(sentence):
- return [vocab[token] for token in sentence if token in vocab]
-
-text8 = text8.transform(code, lazy=False)
-
-print('# sentences:', len(text8))
-for sentence in text8[:3]:
- print('# tokens:', len(sentence), sentence[:5])
-```
-
-Next we need to transform the coded Text8 dataset into batches that are more useful for
-training an embedding model.
-
-In this tutorial we train leveraging the SkipGram
-objective made popular by the following: [1].
-
-For SkipGram, we sample pairs of co-occurring
-words from the corpus.
-Two words are said to co-occur if they occur with
-distance less than a specified *window* size.
-The *window* size is usually
-chosen around 5. Refer to the aforementioned paper for more details.
-
-To obtain the samples from the corpus, we can shuffle the
-sentences and then proceed linearly through each sentence, considering each word
-as well as all the words in its window. In this case, we call the current word
-in focus the center word, and the words in its window, the context words.
-GluonNLP contains `gluonnlp.data.EmbeddingCenterContextBatchify` batchify
-transformation, that takes a corpus, such as the coded Text8 we have here, and
-returns a `DataStream` of batches of center and context words.
-
-To obtain good
-results, each sentence is further subsampled, meaning that words are deleted
-with a probability proportional to their frequency.
-[1] proposes to discard
-individual occurrences of words from the dataset with probability
-
-$$P(w_i) = 1 -
-\sqrt{\frac{t}{f(w_i)}}$$
-
-where $f(w_i)$ is the frequency with which a word is
-observed in a dataset and $t$ is a subsampling constant typically chosen around
-$10^{-5}$.
-[1] has also shown that the final performance is improved if the
-window size is chosen uniformly random for each center words out of the range
-[1, *window*].
-
-For this notebook, we are interested in training a fastText
-embedding model [2]. A fastText model not only associates an embedding vector with
-each token in the vocabulary, but also with a pre-specified number of subwords.
-Commonly 2 million subword vectors are obtained and each subword vector is
-associated with zero, one, or multiple character-ngrams. The mapping between
-character-ngrams and subwords is based on a hash function.
-The *final* embedding
-vector of a token is the mean of the vectors associated with the token and all
-character-ngrams occurring in the string representation of the token. Thereby a
-fastText embedding model can compute meaningful embedding vectors for tokens
-that were not seen during training.
-
-For this notebook, we have prepared a helper function `transform_data_fasttext`
-which builds a series of transformations of the `text8 Dataset` created above,
-applying the techniques we mention briefly above. It returns a `DataStream` over batches as
-well as a `batchify_fn` function that applied to a batch looks up and includes the
-fastText subwords associated with the center words. Additionally, it returns the subword
-function which can be used to obtain the subwords of a given string
-representation of a token. We will take a closer look at the subword function
-farther on.
-
-You can find the `transform_data_fasttext()` function in `data.py` in the
-archive that can be downloaded via the `Download` button at the top of this page.
-
-```{.python .input}
-from data import transform_data_fasttext
-
-batch_size=4096
-data = nlp.data.SimpleDataStream([text8]) # input is a stream of datasets, here just 1. Allows scaling to larger corpora that don't fit in memory
-data, batchify_fn, subword_function = transform_data_fasttext(
- data, vocab, idx_to_counts, cbow=False, ngrams=[3,4,5,6], ngram_buckets=100000, batch_size=batch_size, window_size=5)
-```
-
-```{.python .input}
-batches = data.transform(batchify_fn)
-```
-
-Note that the number of subwords is potentially
-different for every word. Therefore the batchify_fn represents a word with its
-subwords as a row in a compressed sparse row (CSR) matrix. For more information on CSR matrices click here:
-https://mxnet.incubator.apache.org/tutorials/sparse/csr.html
-
-Separating the batchify_fn from the previous word-pair
-sampling is useful, as it allows parallelization of the CSR matrix construction over
-multiple CPU cores for separate batches.
-
-## Subwords
-
-`GluonNLP` provides the concept of a subword function which maps
-words to a list of indices representing their subword.
-Possible subword functions
-include mapping a word to the sequence of it's characters/bytes or hashes of all
-its ngrams.
-
-FastText models use a hash function to map each ngram of a word to
-a number in range `[0, num_subwords)`. We include the same hash function.
-Above
-`transform_data_fasttext` has also returned a `subword_function` object. Let's try it with
-a few words:
-
-```{.python .input}
-idx_to_subwordidxs = subword_function(vocab.idx_to_token)
-for word, subwords in zip(vocab.idx_to_token[:3], idx_to_subwordidxs[:3]):
- print('<'+word+'>', subwords, sep = '\t')
-```
-
-## Model
-
-Here we define a SkipGram model for training fastText embeddings.
-For
-Skip-Gram, the model consists of two independent embedding networks.
-One for the
-center words, and one for the context words.
-For center words, subwords are
-taken into account while for context words only the token itself is taken into
-account.
-
-GluonNLP provides an `nlp.model.train.FasttextEmbeddingModel` block
-which defines the fastText style embedding with subword support.
-It can be used
-for training, but also supports loading models trained with the original C++
-fastText library from `.bin` files.
-After training, vectors for arbitrary words
-can be looked up via `embedding[['a', 'list', 'of', 'potentially', 'unknown',
-'words']]` where `embedding` is an `nlp.model.train.FasttextEmbeddingModel`.
-
-In
-the `model.py` script we provide a definition for the fastText model for the
-SkipGram objective.
-The model definition is a Gluon HybridBlock, meaning that
-the complete forward / backward pass are compiled and executed directly in the
-MXNet backend. Not only does the block include the `FasttextEmbeddingModel` for
-the center words and a simple embedding matrix for the context words, but it
-also takes care of sampling a specified number of noise words for each center-
-context pair. These noise words are called negatives, as the resulting center-
-negative pair is unlikely to occur in the dataset. The model then must learn
-which word-pairs are negatives and which ones are real. Thereby it obtains
-meaningful word and subword vectors for all considered tokens. The negatives are
-sampled from the smoothed unigram frequency distribution.
-
-Let's instantiate and
-initialize the model. We also create a trainer object for updating the
-parameters with AdaGrad.
-Finally we print a summary of the model.
-
-```{.python .input}
-from model import SG as SkipGramNet
-
-emsize = 300
-num_negatives = 5
-
-negatives_weights = mx.nd.array(idx_to_counts)
-embedding = SkipGramNet(
- vocab.token_to_idx, emsize, batch_size, negatives_weights, subword_function, num_negatives=5, smoothing=0.75)
-embedding.initialize(ctx=context)
-embedding.hybridize()
-trainer = mx.gluon.Trainer(embedding.collect_params(), 'adagrad', dict(learning_rate=0.05))
-
-print(embedding)
-```
-
-Let's take a look at the documentation of the forward pass.
-
-```{.python .input}
-print(SkipGramNet.hybrid_forward.__doc__)
-```
-
-Before we start training, let's examine the quality of our randomly initialized
-embeddings:
-
-```{.python .input}
-def norm_vecs_by_row(x):
- return x / (mx.nd.sum(x * x, axis=1) + 1e-10).sqrt().reshape((-1, 1))
-
-
-def get_k_closest_tokens(vocab, embedding, k, word):
- word_vec = norm_vecs_by_row(embedding[[word]])
- vocab_vecs = norm_vecs_by_row(embedding[vocab.idx_to_token])
- dot_prod = mx.nd.dot(vocab_vecs, word_vec.T)
- indices = mx.nd.topk(
- dot_prod.reshape((len(vocab.idx_to_token), )),
- k=k + 1,
- ret_typ='indices')
- indices = [int(i.asscalar()) for i in indices]
- result = [vocab.idx_to_token[i] for i in indices[1:]]
- print('closest tokens to "%s": %s' % (word, ", ".join(result)))
-```
-
-```{.python .input}
-example_token = "vector"
-get_k_closest_tokens(vocab, embedding, 10, example_token)
-```
-
-We can see that in the randomly initialized fastText model the closest tokens to
-"vector" are based on overlapping ngrams.
-
-## Training
-
-Thanks to the Gluon data pipeline and the HybridBlock handling all
-complexity, our training code is very simple.
-We iterate over all batches, move
-them to the appropriate context (GPU), do forward, backward, and parameter update
-and finally include some helpful print statements for following the training
-process.
-
-```{.python .input}
-log_interval = 500
-
-def train_embedding(num_epochs):
- for epoch in range(1, num_epochs + 1):
- start_time = time.time()
- l_avg = 0
- log_wc = 0
-
- print('Beginnign epoch %d and resampling data.' % epoch)
- for i, batch in enumerate(batches):
- batch = [array.as_in_context(context) for array in batch]
- with mx.autograd.record():
- l = embedding(*batch)
- l.backward()
- trainer.step(1)
-
- l_avg += l.mean()
- log_wc += l.shape[0]
- if i % log_interval == 0:
- mx.nd.waitall()
- wps = log_wc / (time.time() - start_time)
- l_avg = l_avg.asscalar() / log_interval
- print('epoch %d, iteration %d, loss %.2f, throughput=%.2fK wps'
- % (epoch, i, l_avg, wps / 1000))
- start_time = time.time()
- log_wc = 0
- l_avg = 0
-
- get_k_closest_tokens(vocab, embedding, 10, example_token)
- print("")
-```
-
-```{.python .input}
-train_embedding(num_epochs=1)
-```
-
-## Word Similarity and Relatedness Task
-
-Word embeddings should capture the
-relationship between words in natural language.
-In the Word Similarity and
-Relatedness Task, word embeddings are evaluated by comparing word similarity
-scores computed from a pair of words with human labels for the similarity or
-relatedness of the pair.
-
-`GluonNLP` includes a number of common datasets for
-the Word Similarity and Relatedness Task. The included datasets are listed in
-the [API documentation](http://gluon-nlp.mxnet.io/api/data.html#word-embedding-evaluation-datasets). We use several of them in the evaluation example below.
-We first show a few samples from the WordSim353 dataset, to get an overall
-feeling of the Dataset structure.
-
-## Evaluation
-
-Thanks to the subword support of the `FasttextEmbeddingModel` we
-can evaluate on all words in the evaluation dataset,
-not only on the ones that we
-observed during training.
-
-We first compute a list of tokens in our evaluation
-dataset and then create an embedding matrix for them based on the fastText model.
-
-```{.python .input}
-rw = nlp.data.RareWords()
-rw_tokens = list(set(itertools.chain.from_iterable((d[0], d[1]) for d in rw)))
-
-rw_token_embedding = nlp.embedding.TokenEmbedding(unknown_token=None, allow_extend=True)
-rw_token_embedding[rw_tokens]= embedding[rw_tokens]
-
-print('There are', len(rw_tokens), 'unique tokens in the RareWords dataset. Examples are:')
-for i in range(5):
- print('\t', rw[i])
-print('The imputed TokenEmbedding has shape', rw_token_embedding.idx_to_vec.shape)
-```
-
-```{.python .input}
-evaluator = nlp.embedding.evaluation.WordEmbeddingSimilarity(
- idx_to_vec=rw_token_embedding.idx_to_vec,
- similarity_function="CosineSimilarity")
-evaluator.initialize(ctx=context)
-evaluator.hybridize()
-```
-
-```{.python .input}
-words1, words2, scores = zip(*([rw_token_embedding.token_to_idx[d[0]],
- rw_token_embedding.token_to_idx[d[1]],
- d[2]] for d in rw))
-words1 = mx.nd.array(words1, ctx=context)
-words2 = mx.nd.array(words2, ctx=context)
-```
-
-```{.python .input}
-pred_similarity = evaluator(words1, words2)
-sr = stats.spearmanr(pred_similarity.asnumpy(), np.array(scores))
-print('Spearman rank correlation on {} pairs of {}: {}'.format(
- len(words1), rw.__class__.__name__, sr.correlation.round(3)))
-```
-
-## Further information
-
-For further information and examples on training and
-evaluating word embeddings with GluonNLP take a look at the Word Embedding
-section on the Scripts / Model Zoo page. There you will find more thorough
-evaluation techniques and other embedding models. In fact, the `data.py` and
-`model.py` files used in this example are the same as the ones used in the
-script.
-
-## References
-
-- [1] Mikolov, Tomas, et al. “Distributed representations of words and phrases
-and their compositionally.”
- Advances in neural information processing
-systems. 2013.
-
-
-- [2] Bojanowski et al., "Enriching Word Vectors with Subword
-Information" Transactions of the Association for Computational Linguistics 2017
diff --git a/docs/index.rst b/docs/index.rst
index cb7f30af41..c3d225a957 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -79,5 +79,5 @@ You may find the 60-min Gluon crash course linked from there especially helpful.
model_zoo/index
examples/index
api/index
- community/index
+ website/index
genindex
diff --git a/docs/md2ipynb.py b/docs/md2ipynb.py
index 3dfa91959b..ef9edf475a 100644
--- a/docs/md2ipynb.py
+++ b/docs/md2ipynb.py
@@ -1,22 +1,26 @@
+import argparse
import os
-import sys
import time
-import notedown
+
import nbformat
+import notedown
-assert len(sys.argv) == 2, 'usage: input.md'
+parser = argparse.ArgumentParser(description='Convert md file to ipynb files.')
+parser.add_argument('input', help='input.md', type=str)
+parser.add_argument('-d', '--disable_compute',
+ help='Disable computing python scripts', action="store_true")
+args = parser.parse_args()
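+
+# Example invocation (the path below is just an illustration):
+#   python3 md2ipynb.py examples/word_embedding/word_embedding.md -d
+# The -d/--disable_compute flag converts the notebook without executing its code cells.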
# timeout for each notebook, in sec
-timeout = 40 * 60
+timeout = 90 * 60
# the files will be ignored for execution
ignore_execution = []
-input_path = sys.argv[1]
-
# Change working directory to directory of input file
-input_dir, input_fn = os.path.split(input_path)
-os.chdir(input_dir)
+input_dir, input_fn = os.path.split(args.input)
+if input_dir:
+ os.chdir(input_dir)
output_fn = '.'.join(input_fn.split('.')[:-1] + ['ipynb'])
@@ -28,8 +32,9 @@
if not any([i in input_fn for i in ignore_execution]):
tic = time.time()
- notedown.run(notebook, timeout)
- print('=== Finished evaluation in %f sec'%(time.time()-tic))
+ if not args.disable_compute:
+ notedown.run(notebook, timeout)
+ print('=== Finished evaluation in %f sec' % (time.time() - tic))
# write
# need to add language info to for syntax highlight
diff --git a/docs/model_zoo.rst b/docs/model_zoo.rst
index 8dd1d9f81f..249d128d6f 100644
--- a/docs/model_zoo.rst
+++ b/docs/model_zoo.rst
@@ -3,74 +3,9 @@ Model Zoo
.. container:: cards
- .. card::
- :title: Word Embedding
- :link: model_zoo/word_embeddings/index.html
-
- Mapping words to vectors.
-
- .. card::
- :title: Language Modeling
- :link: model_zoo/language_model/index.html
-
- Learning the distribution and representation of sequences of words.
-
.. card::
:title: Machine Translation
:link: model_zoo/machine_translation/index.html
From "Hello" to "Bonjour".
- .. card::
- :title: Text Classification
- :link: model_zoo/text_classification/index.html
-
- Categorize texts and documents.
-
- .. card::
- :title: Sentiment Analysis
- :link: model_zoo/sentiment_analysis/index.html
-
- Classifying polarity of emotions and opinions.
-
- .. card::
- :title: Parsing
- :link: model_zoo/parsing/index.html
-
- Dependency parsing.
-
- .. card::
- :title: Natural Language Inference
- :link: model_zoo/natural_language_inference/index.html
-
- Determine if the premise semantically entails the hypothesis.
-
- .. card::
- :title: Text Generation
- :link: model_zoo/text_generation/index.html
-
- Generating language from models.
-
- .. card::
- :title: BERT
- :link: model_zoo/bert/index.html
-
- Transferring pre-trained language representations to language understanding tasks.
-
- .. card::
- :title: Named Entity Recognition
- :link: model_zoo/ner/index.html
-
- Locating and classifying named entity mentioned in unstructured texts.
-
- .. card::
- :title: Intent Classification and Slot Labeling
- :link: model_zoo/intent_cls_slot_labeling/index.html
-
- Predicting the intent of the query and extracting semantic concepts in the query.
-
- .. card::
- :title: Model Conversion
- :link: model_zoo/conversion_tools/index.html
-
- Converting NLP models from other frameworks to GluonNLP.
diff --git a/docs/website/configuration.rst b/docs/website/configuration.rst
new file mode 100644
index 0000000000..3e63dae430
--- /dev/null
+++ b/docs/website/configuration.rst
@@ -0,0 +1,74 @@
+Preview GluonNLP Website Locally
+-----------------------------------------------------------------
+
+The GluonNLP docs website is at `release branch `__, or `master branch `__. Its source code is at `gluon-nlp `__.
+
+Currently the GluonNLP website is built automatically from the source code by CI. In this guide I will cover:
+
+- the structure of files used for the website, and
+- how to make changes to the website and preview the website
+
+Website Structure
+~~~~~~~~~~~~~~~~~
+
+Currently the docs contain four sections: Model Zoo, Examples, API and Community. Note that the Model Zoo section is a link that redirects to the ``scripts`` folder in the parent directory; the other three folders are used exclusively by the docs website. In addition, the sections are built from different combinations of ``rst``, ``py`` and ``md`` files, so when you work on a section you should be aware of which file types it uses and handle them accordingly.
+
+The main structure, the index file of the entire website, is written in ``rst`` format. It calls the index file of each different section separately. Before compiling the website, you should be aware that:
+
+- ``rst`` files are static: they are rendered directly into the website with additional styling;
+- ``md`` files are executable: the python code in these files is run and the results are stored in ``ipynb`` files, which are then converted into website pages.
+
+More specifically, the files in the examples folder are executed and converted into intermediate files before the final HTML files are written, while files in the other folders need no further conversion or computation.
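+
+As an illustration, a single tutorial page can be converted by hand with the ``md2ipynb.py`` helper in the ``docs`` folder; pass ``-d`` to skip executing the python code blocks (the tutorial path below is only an example):
+
+.. code:: bash
+
+ python3 md2ipynb.py examples/word_embedding/word_embedding.md -d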
+
+Environment Configuration Instruction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Next, I will give step-by-step instructions on how to build the website from scratch.
+
+1. Preview website without displaying python output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Use the command from https://github.com/dmlc/gluon-nlp/blob/master/docs/README.txt to install the necessary packages.
+
+.. code:: bash
+
+ pip install sphinx>=1.5.5 sphinx-gallery sphinx_rtd_theme matplotlib Image recommonmark
+
+Then use the command below to build the website locally; all ``python`` scripts are skipped, so the ``python`` code blocks produce no output:
+
+.. code:: bash
+
+ make docs_local MD2IPYNB_OPTION=-d
+
+After a successful build, you will get the full HTML output of the website.
+
+2. Preview website with python output
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For this task, we recommend a ``g4dn.xlarge`` instance on Amazon EC2. For convenience, you can search for *deep learning* in the filter bar to select a deep learning AMI, so that you do not need to install additional drivers.
+
+After you have launched the instance and logged in, install the required packages with the commands below:
+
+.. code:: bash
+
+ git clone https://github.com/dmlc/gluon-nlp
+ cd gluon-nlp
+ pip3 install --user -e '.[extras,dev]'
+
+If necessary, you may also need to download some additional NLTK data.
+
+Start a ``python3`` interpreter and run the commands below to download the required NLTK resources:
+
+.. code:: python
+
+ import nltk
+ nltk.download('perluniprops')
+ nltk.download('nonbreaking_prefixes')
+ nltk.download('punkt')
+
+At this point, you should have all the necessary packages installed. Use the command below to build and preview the website locally with all the python output:
+
+.. code:: bash
+
+ make docs_local
+
diff --git a/docs/community/contribute.rst b/docs/website/contribute.rst
similarity index 100%
rename from docs/community/contribute.rst
rename to docs/website/contribute.rst
diff --git a/docs/community/git.rst b/docs/website/git.rst
similarity index 100%
rename from docs/community/git.rst
rename to docs/website/git.rst
diff --git a/docs/community/index.rst b/docs/website/index.rst
similarity index 88%
rename from docs/community/index.rst
rename to docs/website/index.rst
index f9b1627e02..d5313d8f06 100644
--- a/docs/community/index.rst
+++ b/docs/website/index.rst
@@ -30,7 +30,7 @@ Community
:title: GluonNLP Slack Channel
:link: https://apache-mxnet.slack.com/messages/CCCDM10V9
- #gluon-nlp Slack channel. Click the `sign-up link `_ to register.
+ #gluon-nlp Slack channel. Click the `sign-up link `_ to register.
.. card::
@@ -55,3 +55,4 @@ Interested in contributing to GluonNLP? Check our contribution guide:
contribute
git
release
+ configuration
\ No newline at end of file
diff --git a/docs/community/release.rst b/docs/website/release.rst
similarity index 100%
rename from docs/community/release.rst
rename to docs/website/release.rst
diff --git a/env/cpu/py3-master.yml b/env/cpu/py3-master.yml
deleted file mode 100644
index 15d49c49fb..0000000000
--- a/env/cpu/py3-master.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- # In the -master pipeline, we test without numba. Numba is an optional
- # dependency and GluonNLP needs to work both with and without numba installed.
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://repo.mxnet.io/dist/python/cpu/mxnet-1.6.0-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/cpu/py3.yml b/env/cpu/py3.yml
deleted file mode 100644
index 77a649b07c..0000000000
--- a/env/cpu/py3.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/docker/py3.yml b/env/docker/py3.yml
deleted file mode 100644
index 2c8b532186..0000000000
--- a/env/docker/py3.yml
+++ /dev/null
@@ -1,42 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - scikit-learn==0.21.3
- - cython
- - pytype==2019.10.17
- - pytest==5.2.3
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.30.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
diff --git a/env/gpu/py3-master.yml b/env/gpu/py3-master.yml
deleted file mode 100644
index 593614b587..0000000000
--- a/env/gpu/py3-master.yml
+++ /dev/null
@@ -1,44 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- # In the -master pipeline, we test without numba. Numba is an optional
- # dependency and GluonNLP needs to work both with and without numba installed.
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://repo.mxnet.io/dist/python/cu100/mxnet_cu100-1.6.0-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
- - seqeval
diff --git a/env/gpu/py3.yml b/env/gpu/py3.yml
deleted file mode 100644
index 1ed92f3fa5..0000000000
--- a/env/gpu/py3.yml
+++ /dev/null
@@ -1,43 +0,0 @@
-channels:
- - conda-forge
-dependencies:
- - python=3.5
- - pip
- - perl
- - pandoc=1.19.2
- - tornado=5.1.1
- - sphinx=2.2.1
- - pip:
- - numpy==1.17.4
- - notedown==1.5.1
- - sphinx-gallery==0.4.0
- - recommonmark==0.6.0
- - nbconvert==5.6.1
- - nbsphinx>=0.3.4,<0.4
- - ipython
- - ipykernel
- - numba==0.47
- - https://github.com/szha/mx-theme/tarball/master
- - seaborn
- - jieba
- - cython
- - boto3
- - pytype==2019.10.17
- - pytest==5.3.2
- - pytest-env==0.6.2
- - pytest-cov==2.8.1
- - pytest-xdist==1.31.0
- - pylint==2.4.4
- - pylint-quotes==0.2.1
- - flaky==3.6.1
- - flake8==3.7.9
- - mock<3
- - https://lausen-public.s3.amazonaws.com/mxnet_cu100-1.6.0b20200125-py2.py3-none-manylinux1_x86_64.whl
- - scipy==1.3.2
- - regex==2019.11.1
- - nltk==3.4.5
- - sacremoses==0.0.35
- - spacy==2.2.2
- - sentencepiece==0.1.83
- - sphinx-autodoc-typehints==1.7.0
- - seqeval
diff --git a/examples b/examples
deleted file mode 120000
index 6c33de9655..0000000000
--- a/examples
+++ /dev/null
@@ -1 +0,0 @@
-docs/examples
\ No newline at end of file
diff --git a/mms/README.rst b/mms/README.rst
deleted file mode 100644
index fe8309fa19..0000000000
--- a/mms/README.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Multi-model-server example
-==========================
-
-https://github.com/awslabs/multi-model-server/
-
-Assuming you are located in the root of the GluonNLP repo, you can run this
-example via:
-
-```
-pip install --user multi-model-server
-curl https://dist-bert.s3.amazonaws.com/demo/finetune/sst.params -o mms/sst.params
-~/.local/bin/model-archiver --model-name bert_sst --model-path mms --handler bert:handle --runtime python --export-path /tmp
-~/.local/bin/multi-model-server --start --models bert_sst.mar --model-store /tmp
-curl -X POST http://127.0.0.1:8080/bert_sst/predict -F 'data=["Positive sentiment", "Negative sentiment"]'
-```
-
-
diff --git a/mms/bert.py b/mms/bert.py
deleted file mode 100644
index fedb54632e..0000000000
--- a/mms/bert.py
+++ /dev/null
@@ -1,88 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-import json
-import logging
-
-import mxnet as mx
-import gluonnlp as nlp
-
-
-class BertHandler:
- """GluonNLP based Bert Handler"""
-
- def __init__(self):
- self.error = None
- self._context = None
- self.initialized = False
-
- def initialize(self, context):
- """
- Initialize model. This will be called during model loading time
- :param context: Initial context contains model server system properties.
- :return:
- """
- self._context = context
- gpu_id = context.system_properties["gpu_id"]
- self._mx_ctx = mx.cpu() if gpu_id is None else mx.gpu(gpu_id)
- bert, vocab = nlp.model.get_model('bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased',
- pretrained=False, ctx=self._mx_ctx, use_pooler=True,
- use_decoder=False, use_classifier=False)
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
- self.sentence_transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=128,
- vocab=vocab, pad=True, pair=False)
- self.batchify = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token]), # input
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Pad(axis=0, pad_val=0)) # segment
- # Set dropout to non-zero, to match pretrained model parameter names
- self.net = nlp.model.BERTClassifier(bert, dropout=0.1)
- self.net.load_parameters('sst.params', self._mx_ctx)
- self.net.hybridize()
-
- self.initialized = True
-
- def handle(self, batch, context):
- # we're just faking batch_size==1 but allow dynamic batch size. Ie the
- # actual batch size is the len of the first element.
- try:
- assert len(batch) == 1
- batch = json.loads(batch[0]["data"].decode('utf-8'))
- except (json.JSONDecodeError, KeyError, AssertionError) as e:
- print('call like: curl -X POST http://127.0.0.1:8080/bert_sst/predict '
- '-F \'data=["sentence 1", "sentence 2"]\'')
- raise e
- model_input = self.batchify([self.sentence_transform(sentence) for sentence in batch])
-
- inputs, valid_length, token_types = [arr.as_in_context(self._mx_ctx) for arr in model_input]
- inference_output = self.net(inputs, token_types, valid_length.astype('float32'))
- inference_output = inference_output.as_in_context(mx.cpu())
-
- return [mx.nd.softmax(inference_output).argmax(axis=1).astype('int').asnumpy().tolist()]
-
-
-_service = BertHandler()
-
-
-def handle(data, context):
- if not _service.initialized:
- _service.initialize(context)
-
- if data is None:
- return None
-
- return _service.handle(data, context)
diff --git a/pytest.ini b/pytest.ini
index 44ce6ae88a..768474a0a1 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -6,13 +6,3 @@ markers =
gpu: mark a test that requires GPU.
integration: mark an integration test
skip_master: mark a test that is temporarily skipped for mxnet master validation.
- py3_only: mark a test that is intended for a python3-only feature.
-
-env =
- MXNET_HOME=tests/data
-
-filterwarnings =
- error
- # ignore warning about package resolution using __spec__ or __package__
- # can't reproduce locally
- ignore:.*can't resolve package from __spec__ or __package__.*:ImportWarning
\ No newline at end of file
diff --git a/scripts/__init__.py b/scripts/__init__.py
index dddd18e45d..e69de29bb2 100644
--- a/scripts/__init__.py
+++ b/scripts/__init__.py
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""NLP examples."""
diff --git a/scripts/benchmarks/README.md b/scripts/benchmarks/README.md
new file mode 100644
index 0000000000..097d0fe03c
--- /dev/null
+++ b/scripts/benchmarks/README.md
@@ -0,0 +1,45 @@
+# Benchmarking the Performance of NLP Backbones
+
+We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step
+of the NLP backbones.
+For comparison, we also report the numbers for the corresponding models in HuggingFace Transformers.
+
+## Backbones in HuggingFace
+
+We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking)
+to benchmark the training + inference speed of common workloads in NLP.
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+python3 benchmark_hf.py
+```
+
+It will generate a list of csv files:
+
+```
+├── pytorch_train_fp32.csv
+├── pytorch_train_fp16.csv
+├── pytorch_infer_fp32.csv
+├── pytorch_infer_fp16.csv
+├── pytorch_infer_fp32_ts.csv
+```
+
+## GluonNLP Backbones based on MXNet-2.0
+
+We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout,
+and `TN` layout.
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+bash benchmark_gluonnlp.sh
+```
+
+It will generate csv files with the `gluonnlp_` prefix:
+```
+├── gluonnlp_train_fp32_NT_NT.csv
+├── gluonnlp_train_fp32_NT_TN.csv
+├── gluonnlp_train_fp32_TN_TN.csv
+├── gluonnlp_infer_fp32_NT_NT.csv
+├── gluonnlp_infer_fp32_NT_TN.csv
+├── gluonnlp_infer_fp32_TN_TN.csv
+```
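+
+To profile a single configuration instead of the full sweep in `benchmark_gluonnlp.sh`, you can invoke the Python entry point directly with the `--layout`, `--compute_layout` and `--mode` flags defined in `benchmark_gluonnlp.py`, for example:
+
+```bash
+# Benchmark inference only, with NT layout and TN compute layout
+python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode inference
+```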
diff --git a/scripts/benchmarks/benchmark_gluonnlp.py b/scripts/benchmarks/benchmark_gluonnlp.py
new file mode 100644
index 0000000000..440ffc7335
--- /dev/null
+++ b/scripts/benchmarks/benchmark_gluonnlp.py
@@ -0,0 +1,130 @@
+import mxnet as mx
+import argparse
+import os
+import pandas as pd
+from benchmark_utils import GluonNLPBackboneBenchmark
+import multiprocessing as mp
+from multiprocessing import Process
+mx.npx.set_np()
+
+
+MODELS = [
+ 'google_en_uncased_bert_base',
+ 'google_en_uncased_bert_large',
+ 'google_albert_base_v2',
+ 'google_albert_large_v2',
+ 'google_albert_xlarge_v2',
+ 'google_albert_xxlarge_v2',
+ 'google_electra_small',
+ 'google_electra_base',
+ 'google_electra_large',
+ 'google_uncased_mobilebert',
+ 'fairseq_bart_base',
+ 'fairseq_bart_large'
+]
+
+# (batch_size, seq_length)
+train_workloads =\
+ [(4, 128),
+ (8, 128),
+ (16, 128),
+ (32, 128),
+ (1, 512),
+ (2, 512),
+ (4, 512),
+ (8, 512)]
+
+
+inference_workloads = [
+ (1, 128),
+ (1, 384),
+ (1, 512),
+ (8, 32),
+ (8, 128),
+ (8, 512),
+ (32, 512),
+ (256, 128),
+ (400, 100),
+]
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(description='Benchmark GluonNLP backbones.')
+ parser.add_argument('--layout', type=str, default='NT',
+ help='The layout of the computation')
+ parser.add_argument('--compute_layout', type=str, default=None,
+ help='The compute layout of the computation')
+ parser.add_argument('--mode', type=str, default='train',
+ choices=['train', 'inference'])
+ return parser
+
+
+def run_benchmark(workload, model_name, out_file_name, is_train):
+ if is_train:
+ benchmark = GluonNLPBackboneBenchmark(
+ workloads=workload,
+ model_names=model_name,
+ profile_inference=False,
+ profile_train=True,
+ to_csv=True,
+ train_out_csv_file=out_file_name)
+ benchmark.run()
+ else:
+ benchmark = GluonNLPBackboneBenchmark(
+ workloads=workload,
+ model_names=model_name,
+ profile_inference=True,
+ profile_train=False,
+ to_csv=True,
+ inference_out_csv_file=out_file_name)
+ benchmark.run()
+ return
+
+
+if __name__ == '__main__':
+ mp.set_start_method('spawn')
+ parser = get_parser()
+ args = parser.parse_args()
+ if args.compute_layout is None:
+ args.compute_layout = args.layout
+ for layout, compute_layout in [(args.layout, args.compute_layout)]:
+ if compute_layout != layout:
+ profile_models = [ele for ele in MODELS if 'bart' not in ele]
+ else:
+ profile_models = [ele for ele in MODELS]
+ if args.mode == 'inference':
+ out_dir = 'infer_fp32_{}_{}'.format(layout, compute_layout)
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ os.makedirs(out_dir, exist_ok=True)
+ for model_name in profile_models:
+ for workload in inference_workloads:
+ out_path = os.path.join(out_dir, '{}_{}_{}.csv'.format(model_name, workload[0],
+ workload[1]))
+ process = Process(
+ target=run_benchmark,
+ args=(workload, model_name, out_path, False))
+ process.start()
+ process.join()
+ new_df = pd.read_csv(out_path)
+ df = df.append(new_df, ignore_index=True)
+ df.to_csv('gluonnlp_infer_fp32_{}_{}.csv'.format(layout, compute_layout))
+ elif args.mode == 'train':
+ out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ os.makedirs(out_dir, exist_ok=True)
+ for model_name in profile_models:
+ for workload in train_workloads:
+ out_path = os.path.join(out_dir, '{}_{}_{}.csv'.format(model_name, workload[0],
+ workload[1]))
+ process = Process(
+ target=run_benchmark,
+ args=(workload, model_name, out_path, True))
+ process.start()
+ process.join()
+ new_df = pd.read_csv(out_path)
+ df = df.append(new_df, ignore_index=True)
+ df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout))
+ else:
+ raise NotImplementedError
diff --git a/scripts/benchmarks/benchmark_gluonnlp.sh b/scripts/benchmarks/benchmark_gluonnlp.sh
new file mode 100644
index 0000000000..ada1951864
--- /dev/null
+++ b/scripts/benchmarks/benchmark_gluonnlp.sh
@@ -0,0 +1,14 @@
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode
+done
+
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode
+done
+
+for mode in train inference
+do
+ python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode
+done
diff --git a/scripts/benchmarks/benchmark_hf.py b/scripts/benchmarks/benchmark_hf.py
new file mode 100644
index 0000000000..57ccdcd422
--- /dev/null
+++ b/scripts/benchmarks/benchmark_hf.py
@@ -0,0 +1,184 @@
+import argparse
+import pandas as pd
+import math
+import os
+from multiprocessing import Process
+import torch
+from typing import Callable
+from transformers import HfArgumentParser, PyTorchBenchmark, PyTorchBenchmarkArguments
+import logging
+import timeit
+logger = logging.getLogger()
+
+
+class CustomizedPyTorchBenchmark(PyTorchBenchmark):
+ def _prepare_train_func(self, model_name: str, batch_size: int, sequence_length: int) -> Callable[[], None]:
+ _train = super(CustomizedPyTorchBenchmark, self)._prepare_train_func(model_name,
+ batch_size,
+ sequence_length)
+ def train_fn():
+ _train()
+ torch.cuda.synchronize()
+ return train_fn
+
+ def _measure_speed(self, func) -> float:
+ try:
+ if self.args.is_tpu or self.args.torchscript:
+ # run a few extra iterations to stabilize compilation for tpu and torchscript
+ logger.info("Do inference on TPU or torchscript. Running model 3 extra times to stabilize compilation")
+ timeit.repeat(
+ func, repeat=1, number=3,
+ )
+
+ # as written in https://docs.python.org/2/library/timeit.html#timeit.Timer.repeat, min should be taken rather than the average
+ runtimes = timeit.repeat(func, repeat=self.args.repeat, number=3,)
+
+ if self.args.is_tpu and self.args.torch_xla_tpu_print_metrics:
+ import torch_xla.debug.metrics as met
+
+ self.print_fn(met.metrics_report())
+
+ return min(runtimes) / 3.0
+ except RuntimeError as e:
+ self.print_fn("Doesn't fit on GPU. {}".format(e))
+ return "N/A"
+
+
+HF_MODELS = [
+ 'bert-base-uncased',
+ 'bert-large-uncased',
+ 'albert-base-v2',
+ 'albert-large-v2',
+ 'albert-xlarge-v2',
+ 'albert-xxlarge-v2',
+ 'google/electra-small-discriminator',
+ 'google/electra-base-discriminator',
+ 'google/electra-large-discriminator',
+ 'google/mobilebert-uncased',
+ 'facebook/bart-base',
+ 'facebook/bart-large'
+]
+
+# (batch_size, seq_length)
+train_workloads =\
+ [(4, 128),
+ (8, 128),
+ (16, 128),
+ (32, 128),
+ (1, 512),
+ (2, 512),
+ (4, 512),
+ (8, 512)]
+
+
+inference_workloads = [
+ (1, 128),
+ (1, 384),
+ (1, 512),
+ (8, 32),
+ (8, 128),
+ (8, 512),
+ (32, 512),
+ (256, 128),
+ (400, 100),
+]
+
+
+if __name__ == '__main__':
+ # Profile PyTorch
+ parser = HfArgumentParser(PyTorchBenchmarkArguments)
+ # Benchmark Training
+ for use_fp16 in [False, True]:
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ for model in HF_MODELS:
+ for batch_size, seq_length in train_workloads:
+ prefix = '{}_{}_{}'.format(model, batch_size, seq_length).replace('/', '_')
+ args = ['--models', model,
+ '--batch_sizes', '{}'.format(batch_size),
+ '--sequence_lengths', '{}'.format(seq_length),
+ '--train_time_csv_file', '{}.train_time.csv'.format(prefix),
+ '--train_memory_csv_file', '{}.train_memory.csv'.format(prefix),
+ '--no_env_print',
+ '--repeat', '3',
+ '--save_to_csv', '--training', '--no_inference']
+ if use_fp16:
+ args.append('--fp16')
+ benchmark_args = parser.parse_args_into_dataclasses(args)[0]
+ benchmark = CustomizedPyTorchBenchmark(args=benchmark_args)
+ p = Process(target=benchmark.run)
+ p.start()
+ p.join()
+ try:
+ train_time_df = pd.read_csv('{}.train_time.csv'.format(prefix))
+ train_memory_df = pd.read_csv('{}.train_memory.csv'.format(prefix))
+ latency = train_time_df['result'][0]
+ memory = train_memory_df['result'][0]
+ os.remove('{}.train_time.csv'.format(prefix))
+ os.remove('{}.train_memory.csv'.format(prefix))
+ except Exception:
+ latency = math.nan
+ memory = math.nan
+ new_df = pd.DataFrame({'model': [model],
+ 'batch_size': [batch_size],
+ 'sequence_length': [seq_length],
+ 'latency': [latency],
+ 'memory': [memory]})
+ df = df.append(new_df, ignore_index=True)
+ if use_fp16:
+ df.to_csv('pytorch_train_fp16.csv')
+ else:
+ df.to_csv('pytorch_train_fp32.csv')
+
+ # Benchmark Inference
+ for torch_script in [False, True]:
+ for use_fp16 in [False, True]:
+ if torch_script and use_fp16:
+ # Cannot support both torch_script and use_fp16.
+ continue
+ df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+ 'latency', 'memory'])
+ for model in HF_MODELS:
+ for batch_size, seq_length in inference_workloads:
+ prefix = '{}_{}_{}'.format(model, batch_size, seq_length).replace('/', '_')
+ args = ['--models', model,
+ '--batch_sizes', '{}'.format(batch_size),
+ '--sequence_lengths', '{}'.format(seq_length),
+ '--inference_time_csv_file', '{}.inference_time.csv'.format(prefix),
+ '--inference_memory_csv_file', '{}.inference_memory.csv'.format(prefix),
+ '--no_env_print',
+ '--repeat', '3',
+ '--save_to_csv']
+ if use_fp16:
+ args.append('--fp16')
+ if torch_script:
+ args.append('--torchscript')
+ benchmark_args = parser.parse_args_into_dataclasses(args)[0]
+ benchmark = PyTorchBenchmark(args=benchmark_args)
+ p = Process(target=benchmark.run)
+ p.start()
+ p.join()
+ try:
+ inference_time_df = pd.read_csv('{}.inference_time.csv'.format(prefix))
+ inference_memory_df = pd.read_csv('{}.inference_memory.csv'.format(prefix))
+ latency = inference_time_df['result'][0]
+ memory = inference_memory_df['result'][0]
+ os.remove('{}.inference_time.csv'.format(prefix))
+ os.remove('{}.inference_memory.csv'.format(prefix))
+ except Exception:
+ latency = math.nan
+ memory = math.nan
+ new_df = pd.DataFrame({'model': [model],
+ 'batch_size': [batch_size],
+ 'sequence_length': [seq_length],
+ 'latency': [latency],
+ 'memory': [memory]})
+ df = df.append(new_df, ignore_index=True)
+ if use_fp16 and torch_script:
+ df.to_csv('pytorch_infer_fp16_ts.csv')
+ elif use_fp16 and not torch_script:
+ df.to_csv('pytorch_infer_fp16.csv')
+ elif not use_fp16 and torch_script:
+ df.to_csv('pytorch_infer_fp32_ts.csv')
+ else:
+ df.to_csv('pytorch_infer_fp32.csv')
diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py
new file mode 100644
index 0000000000..c022caff87
--- /dev/null
+++ b/scripts/benchmarks/benchmark_utils.py
@@ -0,0 +1,1011 @@
+"""
+Utilities for working with the local dataset cache.
+This file is adapted from the HuggingFace Transformers library
+at https://github.com/huggingface/transformers/blob/master/src/transformers/benchmark/benchmark_utils.py
+and the AllenNLP library at https://github.com/allenai/allennlp
+Copyright by the AllenNLP authors.
+"""
+
+import copy
+import csv
+import linecache
+import logging
+import os
+import platform
+import sys
+import timeit
+import numpy as np
+import gluonnlp
+from gluonnlp.models import get_backbone
+from gluonnlp.utils.misc import logging_config
+from collections import defaultdict, namedtuple
+from datetime import datetime
+import multiprocessing as mp
+from multiprocessing import Pipe, Process, Queue
+from multiprocessing.connection import Connection
+from typing import Callable, Iterable, List, NamedTuple, Optional, Union, Tuple
+
+# Try to import the optional dependencies (psutil, py3nvml, mxnet, torch, tensorflow)
+try:
+ import psutil
+except ImportError:
+ psutil = None
+
+try:
+ import py3nvml.py3nvml as nvml
+except ImportError:
+ nvml = None
+
+try:
+ import mxnet
+ num_gpus = mxnet.context.num_gpus()
+ from mxnet import profiler as mx_profiler
+ if num_gpus == 0:
+ mx_all_contexts = [mxnet.cpu()]
+ else:
+ mx_all_contexts = [mxnet.gpu(i) for i in range(num_gpus)]
+except ImportError:
+ mxnet = None
+ mx_all_contexts = None
+ mx_profiler = None
+
+try:
+ import torch
+ from torch.cuda import empty_cache as torch_empty_cache
+except ImportError:
+ torch = None
+ torch_empty_cache = None
+
+try:
+ import tensorflow
+ from tensorflow.python.eager import context as tf_context
+except ImportError:
+ tensorflow = None
+ tf_context = None
+
+
+def is_psutil_available():
+ return psutil is not None
+
+
+def is_py3nvml_available():
+ return nvml is not None
+
+
+def is_torch_available():
+ return torch is not None
+
+
+def is_tf_available():
+ return tensorflow is not None
+
+
+def is_mxnet_available():
+ return mxnet is not None
+
+
+if platform.system() == "Windows":
+ from signal import CTRL_C_EVENT as SIGKILL
+else:
+ from signal import SIGKILL
+
+
+logger = logging.getLogger(__name__) # pylint: disable=invalid-name
+logging_config(folder='gluonnlp_benchmark', name='benchmark', logger=logger)
+
+
+_is_memory_tracing_enabled = False
+
+BenchmarkOutput = namedtuple(
+ "BenchmarkOutput",
+ [
+ "inference_result",
+ "train_result",
+ ],
+)
+
+
+def separate_process_wrapper_fn(func: Callable[[], None], do_multi_processing: bool) -> Callable[[], None]:
+ """
+    This function wraps another function so that it runs in its own separate process.
+    To ensure accurate memory measurements, it is important that the function
+    is executed in a separate process.
+
+    Args:
+        - `func`: (`callable`): function() -> ...
+            generic function which will be executed in its own separate process
+        - `do_multi_processing`: (`bool`)
+            Whether to run the function in a separate process or not
+ """
+ def multi_process_func(*args, **kwargs):
+        # run the function in its own process to get accurate memory measurements
+ def wrapper_func(queue: Queue, *args):
+ try:
+ result = func(*args)
+ except Exception as e:
+ logger.error(e)
+ print(e)
+ result = "N/A"
+ queue.put(result)
+
+ queue = Queue()
+ p = Process(target=wrapper_func, args=[queue] + list(args))
+ p.start()
+ result = queue.get()
+ p.join()
+ return result
+
+ if do_multi_processing:
+        logger.info(f"Function {func} is executed in its own process...")
+ return multi_process_func
+ else:
+ return func
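+
+# A minimal usage sketch of `separate_process_wrapper_fn` (illustrative only; the
+# wrapped callable below is a hypothetical example and not part of this module; on
+# spawn-based platforms the callable must additionally be picklable):
+#
+#     def _measure_once():
+#         return sum(range(10 ** 6))
+#
+#     wrapped = separate_process_wrapper_fn(_measure_once, do_multi_processing=True)
+#     result = wrapped()  # runs in a child process; the result comes back via a Queue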
+
+
+def is_memory_tracing_enabled():
+ global _is_memory_tracing_enabled
+ return _is_memory_tracing_enabled
+
+
+class Frame(NamedTuple):
+ """ `Frame` is a NamedTuple used to gather the current frame state.
+ `Frame` has the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+ """
+
+ filename: str
+ module: str
+ line_number: int
+ event: str
+ line_text: str
+
+
+class UsedMemoryState(NamedTuple):
+    """ `UsedMemoryState` is a named tuple with the following fields:
+        - 'frame': a `Frame` namedtuple (see above) storing information on the current tracing frame (current file, location in current file)
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+ """
+
+ frame: Frame
+ cpu_memory: int
+ gpu_memory: int
+
+
+class Memory(NamedTuple):
+    """ `Memory` is a NamedTuple with a single field `bytes`;
+        calling `__repr__` returns a human-readable string with the number of megabytes.
+            - `bytes` (integer): number of bytes
+ """
+
+ bytes: int
+
+ def __repr__(self) -> str:
+ return str(bytes_to_mega_bytes(self.bytes))
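+
+# Illustrative example: repr(Memory(5 * 2 ** 20)) == '5', i.e. the byte count is reported
+# as whole megabytes via `bytes_to_mega_bytes` (defined later in this module).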
+
+
+class MemoryState(NamedTuple):
+    """ `MemoryState` is a namedtuple listing frame + CPU/GPU memory with the following fields:
+        - `frame` (`Frame`): the current frame (see above)
+        - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
+        - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
+        - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
+ """
+
+ frame: Frame
+ cpu: Memory
+ gpu: Memory
+ cpu_gpu: Memory
+
+
+class MemorySummary(NamedTuple):
+    """ `MemorySummary` is a namedtuple with the following fields:
+        - `sequential`: a list of `MemoryState` namedtuples (see above) computed from the provided `memory_trace`
+            by subtracting the memory after executing each line from the memory before executing said line.
+        - `cumulative`: a list of `MemoryState` namedtuples (see above) with the cumulative increase in memory for each line,
+            obtained by summing the repeated memory increases for a line if it is executed several times.
+            The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released).
+        - `current`: a list of `MemoryState` namedtuples with the absolute memory recorded at each traced line, sorted from largest to smallest consumption.
+        - `total`: total memory increase during the full tracing as a `Memory` named tuple (see above).
+            Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+ """
+
+ sequential: List[MemoryState]
+ cumulative: List[MemoryState]
+ current: List[MemoryState]
+ total: Memory
+
+
+MemoryTrace = List[UsedMemoryState]
+
+
+def measure_peak_memory_cpu(function: Callable[[], None], interval=0.5, device_idx=None) -> int:
+ """
+ measures peak cpu memory consumption of a given `function`
+ running the function for at least interval seconds
+ and at most 20 * interval seconds.
+ This function is heavily inspired by: `memory_usage`
+ of the package `memory_profiler`: https://github.com/pythonprofilers/memory_profiler/blob/895c4ac7a08020d66ae001e24067da6dcea42451/memory_profiler.py#L239
+
+ Args:
+ - `function`: (`callable`): function() -> ...
+            function without any arguments for which to measure the peak memory
+
+ - `interval`: (`float`, `optional`, defaults to `0.5`)
+            interval in seconds at which the memory usage is sampled
+
+ - `device_idx`: (`int`, `optional`, defaults to `None`)
+ device id for which to measure gpu usage
+
+ Returns:
+ - `max_memory`: (`int`)
+            peak memory consumption in bytes
+ """
+
+ def get_cpu_memory(process_id: int) -> int:
+ """
+ measures current cpu memory usage of a given `process_id`
+
+ Args:
+ - `process_id`: (`int`)
+ process_id for which to measure memory
+
+ Returns
+ - `memory`: (`int`)
+            consumed memory in bytes
+ """
+ process = psutil.Process(process_id)
+ try:
+ meminfo_attr = "memory_info" if hasattr(process, "memory_info") else "get_memory_info"
+ memory = getattr(process, meminfo_attr)()[0]
+ except psutil.AccessDenied:
+ raise ValueError("Error with Psutil.")
+ return memory
+
+ if not is_psutil_available():
+ logger.warning(
+ "Psutil not installed, we won't log CPU memory usage. "
+ "Install Psutil (pip install psutil) to use CPU memory tracing."
+ )
+ max_memory = "N/A"
+ else:
+
+ class MemoryMeasureProcess(Process):
+
+ """
+ `MemoryMeasureProcess` inherits from `Process` and overwrites
+ its `run()` method. Used to measure the memory usage of a process
+ """
+
+ def __init__(self, process_id: int, child_connection: Connection, interval: float):
+ super().__init__()
+ self.process_id = process_id
+ self.interval = interval
+ self.connection = child_connection
+ self.num_measurements = 1
+ self.mem_usage = get_cpu_memory(self.process_id)
+
+ def run(self):
+ self.connection.send(0)
+ stop = False
+ while True:
+ self.mem_usage = max(self.mem_usage, get_cpu_memory(self.process_id))
+ self.num_measurements += 1
+
+ if stop:
+ break
+
+ stop = self.connection.poll(self.interval)
+
+ # send results to parent pipe
+ self.connection.send(self.mem_usage)
+ self.connection.send(self.num_measurements)
+
+ while True:
+ # create child, parent connection
+ child_connection, parent_connection = Pipe()
+
+ # instantiate process
+ mem_process = MemoryMeasureProcess(os.getpid(), child_connection, interval)
+ mem_process.start()
+
+ # wait until we get memory
+ parent_connection.recv()
+
+ try:
+ # execute function
+ function()
+
+                # signal the child process to stop measuring
+ parent_connection.send(0)
+
+ # receive memory and num measurements
+ max_memory = parent_connection.recv()
+ num_measurements = parent_connection.recv()
+ except Exception:
+ # kill process in a clean way
+ parent = psutil.Process(os.getpid())
+ for child in parent.children(recursive=True):
+ os.kill(child.pid, SIGKILL)
+ mem_process.join(0)
+ raise RuntimeError("Process killed. Error in Process")
+
+ # run process at least 20 * interval or until it finishes
+ mem_process.join(20 * interval)
+
+ if (num_measurements > 4) or (interval < 1e-6):
+ break
+
+ # reduce interval
+ interval /= 10
+
+ return max_memory
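+
+# A minimal usage sketch of `measure_peak_memory_cpu` (illustrative only; assumes
+# psutil is installed and uses a hypothetical workload):
+#
+#     def _allocate():
+#         _ = [0.0] * (10 ** 7)
+#
+#     peak_bytes = measure_peak_memory_cpu(_allocate, interval=0.1)
+#     print(peak_bytes >> 20, "MB peak")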
+
+
+def start_memory_tracing(
+ modules_to_trace: Optional[Union[str, Iterable[str]]] = None,
+ modules_not_to_trace: Optional[Union[str, Iterable[str]]] = None,
+ events_to_trace: str = "line",
+ gpus_to_trace: Optional[List[int]] = None,
+) -> MemoryTrace:
+ """ Setup line-by-line tracing to record rss mem (RAM) at each line of a module or sub-module.
+ See `./benchmark.py` for usage examples.
+ Current memory consumption is returned using psutil and in particular is the RSS memory
+            "Resident Set Size" (the non-swapped physical memory the process is using).
+ See https://psutil.readthedocs.io/en/latest/#psutil.Process.memory_info
+
+ Args:
+ - `modules_to_trace`: (None, string, list/tuple of string)
+ if None, all events are recorded
+ if string or list of strings: only events from the listed module/sub-module will be recorded (e.g. 'fairseq' or 'transformers.modeling_gpt2')
+ - `modules_not_to_trace`: (None, string, list/tuple of string)
+ if None, no module is avoided
+ if string or list of strings: events from the listed module/sub-module will not be recorded (e.g. 'torch')
+ - `events_to_trace`: string or list of string of events to be recorded (see official python doc for `sys.settrace` for the list of events)
+ default to line
+ - `gpus_to_trace`: (optional list, default None) list of GPUs to trace. Default to tracing all GPUs
+
+ Return:
+ - `memory_trace` is a list of `UsedMemoryState` for each event (default each line of the traced script).
+        - `UsedMemoryState` is a named tuple with the following fields:
+ - 'frame': a `Frame` namedtuple (see below) storing information on the current tracing frame (current file, location in current file)
+ - 'cpu_memory': CPU RSS memory state *before* executing the line
+ - 'gpu_memory': GPU used memory *before* executing the line (sum for all GPUs or for only `gpus_to_trace` if provided)
+
+ `Frame` is a namedtuple used by `UsedMemoryState` to list the current frame state.
+ `Frame` has the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+
+ """
+ if is_psutil_available():
+ process = psutil.Process(os.getpid())
+ else:
+ logger.warning(
+ "Psutil not installed, we won't log CPU memory usage. "
+ "Install psutil (pip install psutil) to use CPU memory tracing."
+ )
+ process = None
+
+ if is_py3nvml_available():
+ try:
+ nvml.nvmlInit()
+ devices = list(range(nvml.nvmlDeviceGetCount())) if gpus_to_trace is None else gpus_to_trace
+ nvml.nvmlShutdown()
+ except (OSError, nvml.NVMLError):
+            logger.warning("Error while initializing communication with GPU. "
+                           "We won't perform GPU memory tracing.")
+ log_gpu = False
+ else:
+ log_gpu = True
+ else:
+ logger.warning(
+ "py3nvml not installed, we won't log GPU memory usage. "
+ "Install py3nvml (pip install py3nvml) to use GPU memory tracing."
+ )
+ log_gpu = False
+
+ memory_trace = []
+
+ def traceit(frame, event, args):
+ """ Tracing method executed before running each line in a module or sub-module
+ Record memory allocated in a list with debugging information
+ """
+ global _is_memory_tracing_enabled
+
+ if not _is_memory_tracing_enabled:
+ return traceit
+
+ # Filter events
+ if events_to_trace is not None:
+ if isinstance(events_to_trace, str) and event != events_to_trace:
+ return traceit
+ elif isinstance(events_to_trace, (list, tuple)) and event not in events_to_trace:
+ return traceit
+
+ if "__name__" not in frame.f_globals:
+ return traceit
+
+ # Filter modules
+ name = frame.f_globals["__name__"]
+ if not isinstance(name, str):
+ return traceit
+ else:
+ # Filter whitelist of modules to trace
+ if modules_to_trace is not None:
+ if isinstance(modules_to_trace, str) and modules_to_trace not in name:
+ return traceit
+ elif isinstance(modules_to_trace, (list, tuple)) and all(m not in name for m in modules_to_trace):
+ return traceit
+
+ # Filter blacklist of modules not to trace
+ if modules_not_to_trace is not None:
+ if isinstance(modules_not_to_trace, str) and modules_not_to_trace in name:
+ return traceit
+ elif isinstance(modules_not_to_trace, (list, tuple)) and any(m in name for m in modules_not_to_trace):
+ return traceit
+
+ # Record current tracing state (file, location in file...)
+ lineno = frame.f_lineno
+ filename = frame.f_globals["__file__"]
+ if filename.endswith(".pyc") or filename.endswith(".pyo"):
+ filename = filename[:-1]
+ line = linecache.getline(filename, lineno).rstrip()
+ traced_state = Frame(filename, name, lineno, event, line)
+
+ # Record current memory state (rss memory) and compute difference with previous memory state
+ cpu_mem = 0
+ if process is not None:
+ mem = process.memory_info()
+ cpu_mem = mem.rss
+
+ gpu_mem = 0
+ if log_gpu:
+ # Clear GPU caches
+ if is_mxnet_available():
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ if is_torch_available():
+ torch_empty_cache()
+ if is_tf_available():
+ tf_context.context()._clear_caches() # See https://github.com/tensorflow/tensorflow/issues/20218#issuecomment-416771802
+
+ # Sum used memory for all GPUs
+ nvml.nvmlInit()
+
+ for i in devices:
+ handle = nvml.nvmlDeviceGetHandleByIndex(i)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ gpu_mem += meminfo.used
+
+ nvml.nvmlShutdown()
+
+ mem_state = UsedMemoryState(traced_state, cpu_mem, gpu_mem)
+ memory_trace.append(mem_state)
+
+ return traceit
+
+ sys.settrace(traceit)
+
+ global _is_memory_tracing_enabled
+ _is_memory_tracing_enabled = True
+
+ return memory_trace
+
+
+def stop_memory_tracing(
+ memory_trace: Optional[MemoryTrace] = None, ignore_released_memory: bool = True
+) -> Optional[MemorySummary]:
+ """ Stop memory tracing cleanly and return a summary of the memory trace if a trace is given.
+
+ Args:
+        - `memory_trace` (optional output of start_memory_tracing, default: None): memory trace to convert into a summary
+        - `ignore_released_memory` (boolean, default: True): if True, only memory increases are summed to compute the total memory
+
+ Return:
+ - None if `memory_trace` is None
+        - `MemorySummary` namedtuple otherwise, with the following fields:
+            - `sequential`: a list of `MemoryState` namedtuples (see below) computed from the provided `memory_trace`
+                by subtracting the memory after executing each line from the memory before executing said line.
+            - `cumulative`: a list of `MemoryState` namedtuples (see below) with the cumulative increase in memory for each line,
+                obtained by summing the repeated memory increases for a line if it is executed several times.
+                The list is sorted from the frame with the largest memory consumption to the frame with the smallest (can be negative if memory is released).
+            - `current`: a list of `MemoryState` namedtuples with the absolute memory recorded at each traced line, sorted from largest to smallest consumption.
+            - `total`: total memory increase during the full tracing as a `Memory` named tuple (see below).
+                Lines with memory release (negative consumption) are ignored if `ignore_released_memory` is `True` (default).
+
+        `Memory` named tuples have a single field
+            - `bytes` (integer): number of bytes;
+              their `repr` is the human-readable number of megabytes.
+
+        `Frame` is a namedtuple used to list the current frame state, with the following fields:
+ - 'filename' (string): Name of the file currently executed
+ - 'module' (string): Name of the module currently executed
+ - 'line_number' (int): Number of the line currently executed
+ - 'event' (string): Event that triggered the tracing (default will be "line")
+ - 'line_text' (string): Text of the line in the python script
+
+        `MemoryState` is a namedtuple listing frame + CPU/GPU memory with the following fields:
+            - `frame` (`Frame`): the current frame (see above)
+            - `cpu`: CPU memory consumed during the current frame as a `Memory` named tuple
+            - `gpu`: GPU memory consumed during the current frame as a `Memory` named tuple
+            - `cpu_gpu`: CPU + GPU memory consumed during the current frame as a `Memory` named tuple
+ """
+ global _is_memory_tracing_enabled
+ _is_memory_tracing_enabled = False
+
+ if memory_trace is not None and len(memory_trace) > 1:
+ memory_diff_trace = []
+ memory_curr_trace = []
+
+ cumulative_memory_dict = defaultdict(lambda: [0, 0, 0])
+
+ for ((frame, cpu_mem, gpu_mem), (next_frame, next_cpu_mem, next_gpu_mem),) in zip(
+ memory_trace[:-1], memory_trace[1:]
+ ):
+ cpu_mem_inc = next_cpu_mem - cpu_mem
+ gpu_mem_inc = next_gpu_mem - gpu_mem
+ cpu_gpu_mem_inc = cpu_mem_inc + gpu_mem_inc
+ memory_diff_trace.append(
+ MemoryState(
+ frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+ )
+ )
+
+ memory_curr_trace.append(
+ MemoryState(
+ frame=frame,
+ cpu=Memory(next_cpu_mem),
+ gpu=Memory(next_gpu_mem),
+ cpu_gpu=Memory(next_gpu_mem + next_cpu_mem),
+ )
+ )
+
+ cumulative_memory_dict[frame][0] += cpu_mem_inc
+ cumulative_memory_dict[frame][1] += gpu_mem_inc
+ cumulative_memory_dict[frame][2] += cpu_gpu_mem_inc
+
+ cumulative_memory = sorted(
+ list(cumulative_memory_dict.items()), key=lambda x: x[1][2], reverse=True
+ ) # order by the total CPU + GPU memory increase
+ cumulative_memory = list(
+ MemoryState(
+ frame=frame, cpu=Memory(cpu_mem_inc), gpu=Memory(gpu_mem_inc), cpu_gpu=Memory(cpu_gpu_mem_inc),
+ )
+ for frame, (cpu_mem_inc, gpu_mem_inc, cpu_gpu_mem_inc) in cumulative_memory
+ )
+
+ memory_curr_trace = sorted(memory_curr_trace, key=lambda x: x.cpu_gpu.bytes, reverse=True)
+
+ if ignore_released_memory:
+ total_memory = sum(max(0, step_trace.cpu_gpu.bytes) for step_trace in memory_diff_trace)
+ else:
+ total_memory = sum(step_trace.cpu_gpu.bytes for step_trace in memory_diff_trace)
+
+ total_memory = Memory(total_memory)
+
+ return MemorySummary(
+ sequential=memory_diff_trace, cumulative=cumulative_memory, current=memory_curr_trace, total=total_memory,
+ )
+
+ return None
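+
+# A minimal tracing sketch pairing `start_memory_tracing` with `stop_memory_tracing`
+# (illustrative only; `run_workload` is a hypothetical function, and what actually gets
+# recorded depends on whether psutil / py3nvml are available):
+#
+#     trace = start_memory_tracing("gluonnlp")
+#     run_workload()
+#     summary = stop_memory_tracing(trace)
+#     if summary is not None:
+#         print(summary.total)  # total memory increase, printed as megabytes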
+
+
+def bytes_to_mega_bytes(memory_amount: int) -> int:
+ """ Utility to convert a number of bytes (int) into a number of mega bytes (int)
+ """
+ return memory_amount >> 20
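+
+# Example (illustrative): bytes_to_mega_bytes(3 * 2 ** 20) == 3, i.e. an integer right
+# shift by 20 bits converts bytes to whole megabytes, truncating any remainder.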
+
+
+class GluonNLPBackboneBenchmark:
+ """
+    A simple but feature-complete benchmarking utility to compare the memory
+    and time performance of GluonNLP backbone models.
+ """
+ def __init__(self, workloads, model_names, use_fp16=False,
+ repeat=3, use_gpu=True, device_idx=0,
+ profile_inference=True,
+ profile_train=True,
+ env_print=True,
+ to_csv=False,
+ layout='NT',
+ compute_layout='auto',
+ inference_out_csv_file='inference_time_memory.csv',
+ train_out_csv_file='train_time_memory.csv',
+ env_info_file='env_info.csv'):
+        if not isinstance(workloads, list):
+            workloads = [workloads]
+        if not isinstance(model_names, (list, tuple)):
+            model_names = [model_names]
+        self._workloads = workloads
+ self._model_names = model_names
+ self._use_fp16 = use_fp16
+ self._repeat = repeat
+ self._use_gpu = use_gpu
+ self._device_idx = device_idx
+ self._environment_info = None
+ self._profile_inference = profile_inference
+ self._profile_train = profile_train
+ self._env_print = env_print
+ self._to_csv = to_csv
+ self._layout = layout
+ self._compute_layout = compute_layout
+ self._inference_out_csv_file = inference_out_csv_file
+ self._train_out_csv_file = train_out_csv_file
+ self._env_info_file = env_info_file
+        assert use_fp16 is False, 'The fp16 benchmark is not supported yet.'
+
+ @property
+ def model_names(self):
+ return self._model_names
+
+ @property
+ def workloads(self):
+ return self._workloads
+
+ def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
+ -> Tuple[float, Memory]:
+ if self._use_gpu:
+ ctx = mxnet.gpu()
+ else:
+ ctx = mxnet.cpu()
+ model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
+ # TODO Support fp16 profiling
+ cfg.defrost()
+ cfg.MODEL.layout = self._layout
+ if model_cls.__name__ not in ['BartModel']:
+ cfg.MODEL.compute_layout = self._compute_layout
+ cfg.freeze()
+ if model_cls.__name__ in ['BartModel']:
+ model = model_cls.from_cfg(cfg, extract_feature=True)
+ else:
+ model = model_cls.from_cfg(cfg)
+ model.load_parameters(backbone_param_path, ctx=ctx)
+ model.hybridize()
+ vocab_size = cfg.MODEL.vocab_size
+ if self._layout == 'NT':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ elif self._layout == 'TN':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ else:
+ raise NotImplementedError
+ mxnet.npx.waitall()
+
+ def run_forward():
+ if 'roberta' in model_name or 'xlmr' in model_name:
+ out = model(input_ids, valid_length)
+ elif 'bart' in model_name:
+ out = model(input_ids, valid_length, input_ids, valid_length)
+ else:
+ out = model(input_ids, token_types, valid_length)
+ if isinstance(out, list):
+ for ele in out:
+ ele.wait_to_read()
+ else:
+ out.wait_to_read()
+
+ timeit.repeat(run_forward, repeat=1, number=3)
+ runtimes = timeit.repeat(run_forward, repeat=self._repeat, number=3)
+ mxnet.npx.waitall()
+ # Profile memory
+ if self._use_gpu:
+ nvml.nvmlInit()
+ run_forward()
+ mxnet.npx.waitall()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ max_bytes_in_use = meminfo.used
+ memory = Memory(max_bytes_in_use)
+ # shutdown nvml
+ nvml.nvmlShutdown()
+ else:
+ # cpu
+ memory_bytes = measure_peak_memory_cpu(run_forward)
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
+ return float(np.min(runtimes) / 3.0), memory
+
+ def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
+ -> Tuple[float, Memory]:
+ if self._use_gpu:
+ ctx = mxnet.gpu()
+ else:
+ ctx = mxnet.cpu()
+ model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
+ # TODO Support fp16 profiling
+ cfg.defrost()
+ cfg.MODEL.layout = self._layout
+ if model_cls.__name__ not in ['BartModel']:
+ cfg.MODEL.compute_layout = self._compute_layout
+ cfg.freeze()
+ if model_cls.__name__ in ['BartModel']:
+ model = model_cls.from_cfg(cfg, extract_feature=True)
+ else:
+ model = model_cls.from_cfg(cfg)
+ model.load_parameters(backbone_param_path, ctx=ctx)
+ model.hybridize()
+ vocab_size = cfg.MODEL.vocab_size
+ if hasattr(cfg.MODEL, 'units'):
+ out_units = cfg.MODEL.units
+ else:
+ out_units = cfg.MODEL.DECODER.units
+ if self._layout == 'NT':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (batch_size, sequence_length),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((batch_size, sequence_length), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ contextual_embedding_ograd = mxnet.np.random.normal(
+ 0, 1, (batch_size, sequence_length, out_units),
+ dtype=np.float32, ctx=ctx)
+ pooled_out_ograd = mxnet.np.random.normal(
+ 0, 1, (batch_size, out_units), dtype=np.float32, ctx=ctx)
+ elif self._layout == 'TN':
+ input_ids = mxnet.np.random.randint(0, vocab_size, (sequence_length, batch_size),
+ dtype=np.int32, ctx=ctx)
+ token_types = mxnet.np.zeros((sequence_length, batch_size), dtype=np.int32, ctx=ctx)
+ valid_length = mxnet.np.full((batch_size,), sequence_length,
+ dtype=np.int32, ctx=ctx)
+ contextual_embedding_ograd = mxnet.np.random.normal(
+ 0, 1, (sequence_length, batch_size, out_units),
+ dtype=np.float32, ctx=ctx)
+ pooled_out_ograd = mxnet.np.random.normal(0, 1, (batch_size, out_units),
+ dtype=np.float32,
+ ctx=ctx)
+ else:
+ raise NotImplementedError
+ if model_cls.__name__ in ['BertModel', 'AlbertModel', 'ElectraModel', 'MobileBertModel']:
+ def train_step():
+ with mxnet.autograd.record():
+ contextual_embedding, pooled_out = model(input_ids, token_types, valid_length)
+ # We'd like to set the head gradient of
+ # contextual_embedding to contextual_embedding_ograd
+ # and the head gradient of pooled_out to pooled_out_ograd
+                    # Thus, we simply do two Hadamard products and sum up the results.
+ fake_loss = mxnet.np.sum(contextual_embedding * contextual_embedding_ograd)\
+ + mxnet.np.sum(pooled_out * pooled_out_ograd)
+ fake_loss.backward()
+ mxnet.npx.waitall()
+ elif model_cls.__name__ in ['BartModel']:
+ def train_step():
+ with mxnet.autograd.record():
+ contextual_embedding, pooled_out = model(input_ids, valid_length,
+ input_ids, valid_length)
+ fake_loss = (contextual_embedding * contextual_embedding_ograd).sum() \
+ + (pooled_out * pooled_out_ograd).sum()
+ fake_loss.backward()
+ mxnet.npx.waitall()
+ else:
+ raise NotImplementedError
+ timeit.repeat(train_step, repeat=1, number=3)
+ mxnet.npx.waitall()
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ runtimes = timeit.repeat(train_step, repeat=self._repeat, number=3)
+ mxnet.npx.waitall()
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ # Profile memory
+ if self._use_gpu:
+ nvml.nvmlInit()
+ train_step()
+ mxnet.npx.waitall()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ meminfo = nvml.nvmlDeviceGetMemoryInfo(handle)
+ max_bytes_in_use = meminfo.used
+ memory = Memory(max_bytes_in_use)
+ # shutdown nvml
+ nvml.nvmlShutdown()
+ else:
+ # cpu
+ memory_bytes = measure_peak_memory_cpu(train_step)
+ memory = Memory(memory_bytes) if isinstance(memory_bytes, int) else memory_bytes
+ return float(np.min(runtimes) / 3.0), memory
+
+    def inference_speed_memory(self, *args, **kwargs) -> Tuple[float, Memory]:
+ return separate_process_wrapper_fn(self._inference_speed_memory, False)(*args, **kwargs)
+
+    def train_speed_memory(self, *args, **kwargs) -> Tuple[float, Memory]:
+ return separate_process_wrapper_fn(self._train_speed_memory, False)(*args, **kwargs)
+
+ def run(self):
+ result_dict = {model_name: {} for model_name in self._model_names}
+ inference_result = copy.deepcopy(result_dict)
+ train_result = copy.deepcopy(result_dict)
+
+ for c, model_name in enumerate(self.model_names):
+ logger.info(f"{c + 1} / {len(self.model_names)}")
+ inference_result[model_name] = dict()
+ train_result[model_name] = dict()
+
+ for workload in self._workloads:
+ batch_size, sequence_length = workload
+ if self._profile_inference:
+ try:
+ infer_time, infer_memory = self.inference_speed_memory(model_name,
+ batch_size,
+ sequence_length)
+ except Exception as e:
+ logger.info(e)
+ infer_time = np.nan
+ infer_memory = np.nan
+ inference_result[model_name][workload] = (infer_time, infer_memory)
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ self.save_to_csv(inference_result, self._inference_out_csv_file)
+ if self._profile_train:
+ try:
+ train_time, train_memory = self.train_speed_memory(model_name,
+ batch_size,
+ sequence_length)
+ except Exception as e:
+ logger.info(e)
+ train_time = np.nan
+ train_memory = np.nan
+ train_result[model_name][workload] = (train_time, train_memory)
+ for ctx in mx_all_contexts:
+ ctx.empty_cache()
+ mxnet.npx.waitall()
+ self.save_to_csv(train_result, self._train_out_csv_file)
+
+ if self._profile_inference:
+ logger.info("\n" + 20 * "=" + ("INFERENCE - RESULT - SPEED - MEMORY").center(55) + 20 * "=")
+ self.print_results(inference_result)
+
+ if self._profile_train:
+            logger.info("\n" + 20 * "=" + ("TRAIN - RESULT - SPEED - MEMORY").center(55) + 20 * "=")
+ self.print_results(train_result)
+
+ if self._env_print:
+ logger.info("\n" + 20 * "=" + ("ENVIRONMENT INFORMATION").center(40) + 20 * "=")
+ logger.info(
+ "\n".join(["- {}: {}".format(prop, val)
+ for prop, val in self.environment_info.items()]) + "\n"
+ )
+
+ if self._to_csv:
+ with open(self._env_info_file, mode="w", newline="") as csv_file:
+ writer = csv.writer(csv_file)
+ for key, value in self.environment_info.items():
+ writer.writerow([key, value])
+
+ return BenchmarkOutput(
+ inference_result,
+ train_result
+ )
+
+ @property
+ def environment_info(self):
+ if self._environment_info is None:
+ info = {}
+ info["gluonnlp_version"] = gluonnlp.__version__
+ info["framework_version"] = mxnet.__version__
+ info["python_version"] = platform.python_version()
+ info["system"] = platform.system()
+ info["cpu"] = platform.processor()
+ info["architecture"] = platform.architecture()[0]
+ info["date"] = datetime.date(datetime.now())
+ info["time"] = datetime.time(datetime.now())
+ info["fp16"] = self._use_fp16
+
+ if is_psutil_available():
+ info["cpu_ram_mb"] = bytes_to_mega_bytes(psutil.virtual_memory().total)
+ else:
+ logger.warning(
+                    "Psutil not installed, we won't log available CPU memory. "
+ "Install psutil (pip install psutil) to log available CPU memory."
+ )
+ info["cpu_ram_mb"] = "N/A"
+
+ info["use_gpu"] = self._use_gpu
+ if self._use_gpu:
+ info["num_gpus"] = 1
+ if is_py3nvml_available():
+ nvml.nvmlInit()
+ handle = nvml.nvmlDeviceGetHandleByIndex(self._device_idx)
+ info["gpu"] = nvml.nvmlDeviceGetName(handle)
+ info["gpu_ram_mb"] = bytes_to_mega_bytes(nvml.nvmlDeviceGetMemoryInfo(handle).total)
+ info["gpu_power_watts"] = nvml.nvmlDeviceGetPowerManagementLimit(handle) / 1000
+ info["gpu_performance_state"] = nvml.nvmlDeviceGetPerformanceState(handle)
+ nvml.nvmlShutdown()
+ else:
+ logger.warning(
+ "py3nvml not installed, we won't log GPU memory usage. "
+ "Install py3nvml (pip install py3nvml) to log information about GPU."
+ )
+ info["gpu"] = "N/A"
+ info["gpu_ram_mb"] = "N/A"
+ info["gpu_power_watts"] = "N/A"
+ info["gpu_performance_state"] = "N/A"
+ self._environment_info = info
+ return self._environment_info
+
+ def print_results(self, result_dict):
+ logger.info(95 * "-")
+ logger.info(
+ "Model Name".center(30)
+ + "Batch Size".center(15) + "Seq Length".center(15)
+ + "Latency (ms)".center(15) + "Memory".center(15)
+ )
+ logger.info(95 * "-")
+ for model_name in self._model_names:
+ for (batch_size, sequence_length), (time_spent, memory)\
+ in result_dict[model_name].items():
+ if np.isnan(time_spent):
+ time_spent = str(time_spent)
+ else:
+ time_spent = round(1000 * time_spent)
+ time_spent = str(time_spent)
+ memory = str(memory)
+ logger.info(
+ model_name[:30].center(30) + str(batch_size).center(15) +
+ str(sequence_length).center(15) +
+ time_spent.center(15) + memory.center(15)
+ )
+ logger.info(95 * "-")
+
+ def print_memory_trace_statistics(self, summary: MemorySummary):
+ logger.info(
+ "\nLine by line memory consumption:\n"
+ + "\n".join(
+ f"{state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.sequential
+ )
+ )
+ logger.info(
+ "\nLines with top memory consumption:\n"
+ + "\n".join(
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.cumulative[:6]
+ )
+ )
+ logger.info(
+ "\nLines with lowest memory consumption:\n"
+ + "\n".join(
+ f"=> {state.frame.filename}:{state.frame.line_number}: mem {state.cpu_gpu}: {state.frame.line_text}"
+ for state in summary.cumulative[-6:]
+ )
+ )
+ logger.info(f"\nTotal memory increase: {summary.total}")
+
+ def save_to_csv(self, result_dict, filename):
+ if not self._to_csv:
+ return
+ logger.info("Saving results to csv {}.".format(filename))
+ with open(filename, mode="w") as csv_file:
+
+ assert len(self._model_names) > 0, "At least 1 model should be defined, but got {}".format(
+ self._model_names
+ )
+
+ fieldnames = ["model", "batch_size", "sequence_length"]
+ writer = csv.DictWriter(csv_file, fieldnames=fieldnames + ["latency", "memory"])
+ writer.writeheader()
+
+ for model_name in self._model_names:
+ result_dict_model = result_dict[model_name]
+ for (bs, ss), (latency, memory) in result_dict_model.items():
+ writer.writerow(
+ {
+ "model": model_name,
+ "batch_size": bs,
+ "sequence_length": ss,
+ 'latency': str(latency),
+ 'memory': str(memory),
+ }
+ )
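+
+# A minimal usage sketch of `GluonNLPBackboneBenchmark` (illustrative only; the model
+# name and workload below are assumptions, not a prescribed configuration):
+#
+#     benchmark = GluonNLPBackboneBenchmark(workloads=[(1, 128)],
+#                                           model_names='google_en_cased_bert_base',
+#                                           profile_train=False,
+#                                           use_gpu=False,
+#                                           to_csv=True)
+#     benchmark.run()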
diff --git a/scripts/benchmarks/requirements.txt b/scripts/benchmarks/requirements.txt
new file mode 100644
index 0000000000..41332a1cec
--- /dev/null
+++ b/scripts/benchmarks/requirements.txt
@@ -0,0 +1,4 @@
+transformers
+py3nvml
+torch
+torchvision
diff --git a/scripts/bert/__init__.py b/scripts/bert/__init__.py
deleted file mode 100644
index ea93605437..0000000000
--- a/scripts/bert/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT Module."""
-from . import model, data
diff --git a/scripts/bert/bert_qa_evaluate.py b/scripts/bert/bert_qa_evaluate.py
deleted file mode 100644
index 1ba6989ac6..0000000000
--- a/scripts/bert/bert_qa_evaluate.py
+++ /dev/null
@@ -1,394 +0,0 @@
-
-# Copyright 2018 The Google AI Language Team Authors, Allenai and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Bert SQuAD evaluate."""
-import re
-import string
-from collections import Counter, namedtuple, OrderedDict
-
-from mxnet import nd
-
-PredResult = namedtuple('PredResult', ['start', 'end'])
-
-def _get_best_indexes(logits, n_best_size):
- """Get the n-best logits from a list."""
- index_and_score = sorted(
- enumerate(logits), key=lambda x: x[1], reverse=True)
-
- best_indexes = []
- for i, _ in enumerate(index_and_score):
- if i >= n_best_size:
- break
- best_indexes.append(index_and_score[i][0])
- return best_indexes
-
-
-def get_final_text(pred_text, orig_text, tokenizer):
- """Project the tokenized prediction back to the original text."""
-
- # When we created the data, we kept track of the alignment between original
- # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
- # now `orig_text` contains the span of our original text corresponding to the
- # span that we predicted.
- #
- # However, `orig_text` may contain extra characters that we don't want in
- # our prediction.
- #
- # For example, let's say:
- # pred_text = steve smith
- # orig_text = Steve Smith's
- #
- # We don't want to return `orig_text` because it contains the extra "'s".
- #
- # We don't want to return `pred_text` because it's already been normalized
- # (the SQuAD eval script also does punctuation stripping/lower casing but
- # our tokenizer does additional normalization like stripping accent
- # characters).
- #
- # What we really want to return is "Steve Smith".
- #
- # Therefore, we have to apply a semi-complicated alignment heruistic between
- # `pred_text` and `orig_text` to get a character-to-charcter alignment. This
- # can fail in certain cases in which case we just return `orig_text`.
-
- def _strip_spaces(text):
- ns_chars = []
- ns_to_s_map = OrderedDict()
- for (i, c) in enumerate(text):
- if c == ' ':
- continue
- ns_to_s_map[len(ns_chars)] = i
- ns_chars.append(c)
- ns_text = ''.join(ns_chars)
- return (ns_text, ns_to_s_map)
-
- # We first tokenize `orig_text`, strip whitespace from the result
- # and `pred_text`, and check if they are the same length. If they are
- # NOT the same length, the heuristic has failed. If they are the same
- # length, we assume the characters are one-to-one aligned.
-
- tok_text = ' '.join(tokenizer(orig_text))
-
- start_position = tok_text.find(pred_text)
- if start_position == -1:
- return orig_text
- end_position = start_position + len(pred_text) - 1
-
- (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
- (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
-
- if len(orig_ns_text) != len(tok_ns_text):
- return orig_text
-
- # We then project the characters in `pred_text` back to `orig_text` using
- # the character-to-character alignment.
- tok_s_to_ns_map = {}
- for i in tok_ns_to_s_map.keys():
- tok_index = tok_ns_to_s_map[i]
- tok_s_to_ns_map[tok_index] = i
-
- orig_start_position = None
- if start_position in tok_s_to_ns_map:
- ns_start_position = tok_s_to_ns_map[start_position]
- if ns_start_position in orig_ns_to_s_map:
- orig_start_position = orig_ns_to_s_map[ns_start_position]
-
- if orig_start_position is None:
- return orig_text
-
- orig_end_position = None
- if end_position in tok_s_to_ns_map:
- ns_end_position = tok_s_to_ns_map[end_position]
- if ns_end_position in orig_ns_to_s_map:
- orig_end_position = orig_ns_to_s_map[ns_end_position]
-
- if orig_end_position is None:
- return orig_text
-
- output_text = orig_text[orig_start_position:(orig_end_position + 1)]
- return output_text
-
-
-def predict(features,
- results,
- tokenizer,
- max_answer_length=64,
- null_score_diff_threshold=0.0,
- n_best_size=10,
- version_2=False):
- """Get prediction results.
-
- Parameters
- ----------
- features : list of SQuADFeature
- List of squad features for the example.
- results : list of data.qa.PredResult
- List of model predictions for span start and span end.
- tokenizer: callable
- Tokenizer function.
- max_answer_length: int, default 64
- Maximum length of the answer tokens.
- null_score_diff_threshold: float, default 0.0
- If null_score - best_non_null is greater than the threshold predict null.
- n_best_size: int, default 10
- The total number of n-best predictions.
- version_2: bool, default False
- If true, the SQuAD examples contain some that do not have an answer.
-
- Returns
- -------
- prediction: str
- The final prediction.
- nbest : list of (str, float)
- n-best predictions with their probabilities.
- """
-
- _PrelimPrediction = namedtuple('PrelimPrediction',
- ['feature_index', 'start_index', 'end_index',
- 'pred_start', 'pred_end'])
-
- _NbestPrediction = namedtuple(
- 'NbestPrediction', ['text', 'pred_start', 'pred_end'])
-
- prelim_predictions = []
- score_diff = None
-
- score_null = 1000000 # large and positive
- min_null_feature_index = 0 # the paragraph slice with min mull score
- null_pred_start = 0 # the start logit at the slice with min null score
- null_pred_end = 0 # the end logit at the slice with min null score
-
- for features_id, (result, feature) in enumerate(zip(results, features)):
- start_indexes = _get_best_indexes(result.start, n_best_size)
- end_indexes = _get_best_indexes(result.end, n_best_size)
-
- if version_2:
- feature_null_score = result.start[0] + \
- result.end[0]
- if feature_null_score < score_null:
- score_null = feature_null_score
- min_null_feature_index = features_id
- null_pred_start = result.start[0]
- null_pred_end = result.end[0]
-
- for start_index in start_indexes:
- for end_index in end_indexes:
- # We could hypothetically create invalid predictions, e.g., predict
- # that the start of the span is in the question. We throw out all
- # invalid predictions.
- if start_index >= len(feature.tokens):
- continue
- if end_index >= len(feature.tokens):
- continue
- if start_index not in feature.token_to_orig_map:
- continue
- if end_index not in feature.token_to_orig_map:
- continue
- if not feature.token_is_max_context.get(start_index, False):
- continue
- if end_index < start_index:
- continue
- length = end_index - start_index + 1
- if length > max_answer_length:
- continue
- prelim_predictions.append(
- _PrelimPrediction(
- feature_index=features_id,
- start_index=start_index,
- end_index=end_index,
- pred_start=result.start[start_index],
- pred_end=result.end[end_index]))
-
- if version_2:
- prelim_predictions.append(
- _PrelimPrediction(
- feature_index=min_null_feature_index,
- start_index=0,
- end_index=0,
- pred_start=null_pred_start,
- pred_end=null_pred_end))
-
- prelim_predictions = sorted(
- prelim_predictions,
- key=lambda x: (x.pred_start + x.pred_end),
- reverse=True)
-
- seen_predictions = {}
- nbest = []
- for pred in prelim_predictions:
- if len(nbest) >= n_best_size:
- break
- feature = features[pred.feature_index]
- if pred.start_index > 0: # this is a non-null prediction
- tok_tokens = feature.tokens[pred.start_index:(
- pred.end_index + 1)]
- orig_doc_start = feature.token_to_orig_map[pred.start_index]
- orig_doc_end = feature.token_to_orig_map[pred.end_index]
- orig_tokens = feature.doc_tokens[orig_doc_start:(
- orig_doc_end + 1)]
- tok_text = ' '.join(tok_tokens)
-
- # De-tokenize WordPieces that have been split off.
- tok_text = tok_text.replace(' ##', '')
- tok_text = tok_text.replace('##', '')
-
- # Clean whitespace
- tok_text = tok_text.strip()
- tok_text = ' '.join(tok_text.split())
- orig_text = ' '.join(orig_tokens)
-
- final_text = get_final_text(tok_text, orig_text, tokenizer)
- if final_text in seen_predictions:
- continue
-
- seen_predictions[final_text] = True
- else:
- final_text = ''
- seen_predictions[final_text] = True
-
- nbest.append(
- _NbestPrediction(
- text=final_text,
- pred_start=pred.pred_start,
- pred_end=pred.pred_end))
-
- # if we didn't inlude the empty option in the n-best, inlcude it
- if version_2:
- if '' not in seen_predictions:
- nbest.append(
- _NbestPrediction(
- text='',
- pred_start=null_pred_start,
- pred_end=null_pred_end))
- # In very rare edge cases we could have no valid predictions. So we
- # just create a nonce prediction in this case to avoid failure.
- if not nbest:
- nbest.append(
- _NbestPrediction(text='empty', pred_start=0.0, pred_end=0.0))
-
- assert len(nbest) >= 1
-
- total_scores = []
- best_non_null_entry = None
- for entry in nbest:
- total_scores.append(entry.pred_start + entry.pred_end)
- if not best_non_null_entry:
- if entry.text:
- best_non_null_entry = entry
-
- probs = nd.softmax(nd.array(total_scores)).asnumpy()
-
- nbest_json = []
-
- for (i, entry) in enumerate(nbest):
- nbest_json.append((entry.text, float(probs[i])))
-
- if not version_2:
- prediction = nbest_json[0][0]
- else:
- # predict '' iff the null score - the score of best non-null > threshold
- score_diff = score_null - best_non_null_entry.pred_start - \
- best_non_null_entry.pred_end
-
- if score_diff > null_score_diff_threshold:
- prediction = ''
- else:
- prediction = best_non_null_entry.text
- return prediction, nbest_json
-
-
-def normalize_answer(s):
- """Lower text and remove punctuation, articles and extra whitespace."""
-
- def remove_articles(text):
- return re.sub(r'\b(a|an|the)\b', ' ', text)
-
- def white_space_fix(text):
- return ' '.join(text.split())
-
- def remove_punc(text):
- exclude = set(string.punctuation)
- return ''.join(ch for ch in text if ch not in exclude)
-
- def lower(text):
- return text.lower()
-
- return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def f1_score(prediction, ground_truth):
- """Calculate the F1 scores.
- """
- prediction_tokens = normalize_answer(prediction).split()
- ground_truth_tokens = normalize_answer(ground_truth).split()
- common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
- num_same = sum(common.values())
- if num_same == 0:
- return 0
- precision = 1.0 * num_same / len(prediction_tokens)
- recall = 1.0 * num_same / len(ground_truth_tokens)
- f1 = (2 * precision * recall) / (precision + recall)
- return f1
-
-
-def exact_match_score(prediction, ground_truth):
- """Calculate the EM scores.
- """
- return (normalize_answer(prediction) == normalize_answer(ground_truth))
-
-
-def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
- scores_for_ground_truths = []
- for ground_truth in ground_truths:
- score = metric_fn(prediction, ground_truth)
- scores_for_ground_truths.append(score)
- return max(scores_for_ground_truths)
-
-
-def get_F1_EM(dataset, predict_data):
- """Calculate the F1 and EM scores of the predicted results.
- Use only with the SQuAD1.1 dataset.
-
- Parameters
- ----------
- dataset_file: string
- Path to the data file.
- predict_data: dict
- All final predictions.
-
- Returns
- -------
- scores: dict
- F1 and EM scores.
- """
- f1 = exact_match = total = 0
- for record in dataset:
- total += 1
- if record[1] not in predict_data:
- message = 'Unanswered question ' + record[1] + \
- ' will receive score 0.'
- print(message)
- continue
- ground_truths = record[4]
- prediction = predict_data[record[1]]
- exact_match += metric_max_over_ground_truths(
- exact_match_score, prediction, ground_truths)
- f1 += metric_max_over_ground_truths(f1_score, prediction,
- ground_truths)
- exact_match = 100.0 * exact_match / total
- f1 = 100.0 * f1 / total
-
- scores = {'exact_match': exact_match, 'f1': f1}
-
- return scores
diff --git a/scripts/bert/data/__init__.py b/scripts/bert/data/__init__.py
deleted file mode 100644
index 643fa2f832..0000000000
--- a/scripts/bert/data/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT data."""
-
-from . import embedding, transform
diff --git a/scripts/bert/data/create_pretraining_data.py b/scripts/bert/data/create_pretraining_data.py
deleted file mode 100644
index 088e55e653..0000000000
--- a/scripts/bert/data/create_pretraining_data.py
+++ /dev/null
@@ -1,688 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Create masked LM/next sentence masked_lm examples for BERT."""
-
-
-import argparse
-import logging
-import io
-import os
-import glob
-import collections
-import warnings
-import random
-import time
-from multiprocessing import Pool
-import numpy as np
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer
-
-
-class TrainingInstance:
- """A single training instance (sentence pair)."""
-
- def __init__(self, tokens, segment_ids, masked_lm_positions,
- masked_lm_labels, is_random_next, vocab):
- self.tokens = tokens
- self.segment_ids = segment_ids
- self.is_random_next = is_random_next
- self.masked_lm_positions = masked_lm_positions
- self.masked_lm_labels = masked_lm_labels
- self.vocab = vocab
-
- def __str__(self):
- tks = self.vocab.to_tokens(self.tokens)
- mask_tks = self.vocab.to_tokens(self.masked_lm_labels)
- s = ''
- s += 'tokens: %s\n' % (' '.join(tks))
- s += 'segment_ids: %s\n' % (' '.join(
- [str(x) for x in self.segment_ids]))
- s += 'is_random_next: %s\n' % self.is_random_next
- s += 'masked_lm_positions: %s\n' % (' '.join(
- [str(x) for x in self.masked_lm_positions]))
- s += 'masked_lm_labels: %s\n' % (' '.join(mask_tks))
- s += '\n'
- return s
-
- def __repr__(self):
- return self.__str__()
-
-def transform(instance, max_seq_length):
- """Transform instance to inputs for MLM and NSP."""
- input_ids = instance.tokens
- assert len(input_ids) <= max_seq_length
- segment_ids = instance.segment_ids
- masked_lm_positions = instance.masked_lm_positions
- valid_lengths = len(input_ids)
-
- masked_lm_ids = instance.masked_lm_labels
- masked_lm_weights = [1.0] * len(masked_lm_ids)
-
- next_sentence_label = 1 if instance.is_random_next else 0
-
- features = {}
- features['input_ids'] = input_ids
- features['segment_ids'] = segment_ids
- features['masked_lm_positions'] = masked_lm_positions
- features['masked_lm_ids'] = masked_lm_ids
- features['masked_lm_weights'] = masked_lm_weights
- features['next_sentence_labels'] = [next_sentence_label]
- features['valid_lengths'] = [valid_lengths]
- return features
-
-def print_example(instance, features):
- logging.debug('*** Example Instance ***')
- logging.debug('\n%s', instance)
-
- for feature_name in features.keys():
- feature = features[feature_name]
- logging.debug('Generated %s: %s', feature_name, feature)
-
-def write_to_files_np(features, tokenizer, max_seq_length,
- max_predictions_per_seq, output_files):
- # pylint: disable=unused-argument
- """Write to numpy files from `TrainingInstance`s."""
- next_sentence_labels = []
- valid_lengths = []
-
- assert len(output_files) == 1, 'numpy format only support single output file'
- output_file = output_files[0]
- (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
- masked_lm_weights, next_sentence_labels, valid_lengths) = features
- total_written = len(next_sentence_labels)
-
- # store variable length numpy array object directly.
- outputs = collections.OrderedDict()
- outputs['input_ids'] = np.array(input_ids, dtype=object)
- outputs['segment_ids'] = np.array(segment_ids, dtype=object)
- outputs['masked_lm_positions'] = np.array(masked_lm_positions, dtype=object)
- outputs['masked_lm_ids'] = np.array(masked_lm_ids, dtype=object)
- outputs['masked_lm_weights'] = np.array(masked_lm_weights, dtype=object)
- outputs['next_sentence_labels'] = np.array(next_sentence_labels, dtype='int32')
- outputs['valid_lengths'] = np.array(valid_lengths, dtype='int32')
-
- np.savez_compressed(output_file, **outputs)
- logging.info('Wrote %d total instances', total_written)
-
-def tokenize_lines_fn(x):
- """Worker function to tokenize lines based on the tokenizer, and perform vocabulary lookup."""
- lines, tokenizer, vocab = x
- results = []
- for line in lines:
- if not line:
- break
- line = line.strip()
- # Empty lines are used as document delimiters
- if not line:
- results.append([])
- else:
- tokens = vocab[tokenizer(line)]
- if tokens:
- results.append(tokens)
- return results
-
-def convert_to_npz(instances, max_seq_length):
- """Create masked language model and next sentence prediction samples as numpy arrays."""
- input_ids = []
- segment_ids = []
- masked_lm_positions = []
- masked_lm_ids = []
- masked_lm_weights = []
- next_sentence_labels = []
- valid_lengths = []
-
- for inst_index, instance in enumerate(instances):
- features = transform(instance, max_seq_length)
- input_id = features['input_ids']
- segment_id = features['segment_ids']
- masked_lm_position = features['masked_lm_positions']
- masked_lm_id = features['masked_lm_ids']
- masked_lm_weight = features['masked_lm_weights']
- next_sentence_label = features['next_sentence_labels'][0]
- valid_length = features['valid_lengths'][0]
-
- input_ids.append(np.ascontiguousarray(input_id, dtype='int32'))
- segment_ids.append(np.ascontiguousarray(segment_id, dtype='int32'))
- masked_lm_positions.append(np.ascontiguousarray(masked_lm_position, dtype='int32'))
- masked_lm_ids.append(np.ascontiguousarray(masked_lm_id, dtype='int32'))
- masked_lm_weights.append(np.ascontiguousarray(masked_lm_weight, dtype='float32'))
- next_sentence_labels.append(next_sentence_label)
- valid_lengths.append(valid_length)
- # debugging information
- if inst_index < 1:
- print_example(instance, features)
- return input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,\
- next_sentence_labels, segment_ids, valid_lengths
-
-def create_training_instances(x):
- """Create `TrainingInstance`s from raw text.
-
- The expected input file format is the following:
-
- (1) One sentence per line. These should ideally be actual sentences, not
- entire paragraphs or arbitrary spans of text. (Because we use the
- sentence boundaries for the "next sentence prediction" task).
- (2) Blank lines between documents. Document boundaries are needed so
- that the "next sentence prediction" task doesn't span between documents.
-
- The function expect arguments packed in a tuple as described below.
-
- Parameters
- ----------
- input_files : list of str
- List of paths to input text files.
- tokenizer : BERTTokenizer
- The BERT tokenizer
- max_seq_length : int
- The hard limit of maximum sequence length of sentence pairs
- dupe_factor : int
- Duplication factor.
- short_seq_prob : float
- The probability of sampling sequences shorter than the max_seq_length.
- masked_lm_prob : float
- The probability of replacing texts with masks/random words/original words.
- max_predictions_per_seq : int
- The hard limit of the number of predictions for masked words
- whole_word_mask : bool
- Whether to do masking for whole words
- vocab : BERTVocab
- The BERTVocab
- nworker : int
- The number of processes to help processing texts in parallel
- worker_pool : multiprocessing.Pool
- Must be provided if nworker > 1. The caller is responsible for the destruction of
- the worker pool.
- output_file : str or None
- Path to the output file. If None, the result is not serialized. If provided,
- results are stored in the order of (input_ids, segment_ids, masked_lm_positions,
- masked_lm_ids, masked_lm_weights, next_sentence_labels, valid_lengths).
-
- Returns
- -------
- A tuple of np.ndarray : input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights
- next_sentence_labels, segment_ids, valid_lengths
- """
- (input_files, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab,
- dupe_factor, nworker, worker_pool, output_file) = x
-
- time_start = time.time()
- if nworker > 1:
- assert worker_pool is not None
-
- all_documents = [[]]
-
- for input_file in input_files:
- logging.debug('*** Tokenizing file %s***', input_file)
- with io.open(input_file, 'r', encoding='utf-8') as reader:
- lines = reader.readlines()
- num_lines = len(lines)
- num_lines_per_worker = (num_lines + nworker - 1) // nworker
- process_args = []
-
- # tokenize in parallel
- for worker_idx in range(nworker):
- start = worker_idx * num_lines_per_worker
- end = min((worker_idx + 1) * num_lines_per_worker, num_lines)
- process_args.append((lines[start:end], tokenizer, vocab))
- if worker_pool:
- tokenized_results = worker_pool.map(tokenize_lines_fn, process_args)
- else:
- tokenized_results = [tokenize_lines_fn(process_args[0])]
-
- for tokenized_result in tokenized_results:
- for line in tokenized_result:
- if not line:
- if all_documents[-1]:
- all_documents.append([])
- else:
- all_documents[-1].append(line)
-
- # remove the empty document if any
- all_documents = [x for x in all_documents if x]
- random.shuffle(all_documents)
-
- # generate training instances
- instances = []
- if worker_pool:
- process_args = []
- for document_index in range(len(all_documents)):
- process_args.append((all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, tokenizer))
- for _ in range(dupe_factor):
- instances_results = worker_pool.map(create_instances_from_document, process_args)
- for instances_result in instances_results:
- instances.extend(instances_result)
- random.shuffle(instances)
- npz_instances = worker_pool.apply(convert_to_npz, (instances, max_seq_length))
- else:
- for _ in range(dupe_factor):
- for document_index in range(len(all_documents)):
- instances.extend(
- create_instances_from_document(
- (all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, tokenizer)))
- random.shuffle(instances)
- npz_instances = convert_to_npz(instances, max_seq_length)
-
- (input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,
- next_sentence_labels, segment_ids, valid_lengths) = npz_instances
-
- # write output to files. Used when pre-generating files
- if output_file:
- features = (input_ids, segment_ids, masked_lm_positions, masked_lm_ids,
- masked_lm_weights, next_sentence_labels, valid_lengths)
- logging.debug('*** Writing to output file %s ***', output_file)
- write_to_files_np(features, tokenizer, max_seq_length,
- max_predictions_per_seq, [output_file])
- features = None
- else:
- features = (input_ids, masked_lm_ids, masked_lm_positions, masked_lm_weights,
- next_sentence_labels, segment_ids, valid_lengths)
- time_end = time.time()
- logging.debug('Process %d files took %.1f s', len(input_files), time_end - time_start)
- return features
-
-def create_instances_from_document(x):
- """Creates `TrainingInstance`s for a single document."""
- (all_documents, document_index, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab, tokenizer) = x
- document = all_documents[document_index]
- _MASK_TOKEN = vocab[vocab.mask_token]
- _CLS_TOKEN = vocab[vocab.cls_token]
- _SEP_TOKEN = vocab[vocab.sep_token]
-
- # Account for [CLS], [SEP], [SEP]
- max_num_tokens = max_seq_length - 3
-
- # According to the original tensorflow implementation:
- # We *usually* want to fill up the entire sequence since we are padding
- # to `max_seq_length` anyways, so short sequences are generally wasted
- # computation. However, we *sometimes*
- # (i.e., short_seq_prob == 0.1, 10% of the time) want to use shorter
- # sequences to minimize the mismatch between pre-training and fine-tuning.
- # The `target_seq_length` is just a rough target however, whereas
- # `max_seq_length` is a hard limit.
- target_seq_length = max_num_tokens
- if random.random() < short_seq_prob:
- target_seq_length = random.randint(2, max_num_tokens)
-
- # We DON'T just concatenate all of the tokens from a document into a long
- # sequence and choose an arbitrary split point because this would make the
- # next sentence prediction task too easy. Instead, we split the input into
- # segments "A" and "B" based on the actual "sentences" provided by the user
- # input.
- instances = []
- current_chunk = []
- current_length = 0
- i = 0
- while i < len(document): # pylint: disable=R1702
- segment = document[i]
- current_chunk.append(segment)
- current_length += len(segment)
- if i == len(document) - 1 or current_length >= target_seq_length:
- if current_chunk:
- # `a_end` is how many segments from `current_chunk` go into the `A`
- # (first) sentence.
- a_end = 1
- if len(current_chunk) >= 2:
- a_end = random.randint(1, len(current_chunk) - 1)
-
- tokens_a = []
- for j in range(a_end):
- tokens_a.extend(current_chunk[j])
-
- tokens_b = []
- # Random next
- is_random_next = False
- if len(current_chunk) == 1 or random.random() < 0.5:
- is_random_next = True
- target_b_length = target_seq_length - len(tokens_a)
-
- # randomly choose a document other than itself
- random_document_index = random.randint(0, len(all_documents) - 2)
- if random_document_index == document_index:
- random_document_index = len(all_documents) - 1
-
- random_document = all_documents[random_document_index]
- random_start = random.randint(0, len(random_document) - 1)
- for j in range(random_start, len(random_document)):
- tokens_b.extend(random_document[j])
- if len(tokens_b) >= target_b_length:
- break
- # We didn't actually use these segments so we 'put them back' so
- # they don't go to waste.
- num_unused_segments = len(current_chunk) - a_end
- i -= num_unused_segments
- # Actual next
- else:
- is_random_next = False
- for j in range(a_end, len(current_chunk)):
- tokens_b.extend(current_chunk[j])
- truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
-
- assert len(tokens_a) >= 1
- assert len(tokens_b) >= 1
-
- tokens = []
- segment_ids = []
- tokens.append(_CLS_TOKEN)
- segment_ids.append(0)
- for token in tokens_a:
- tokens.append(token)
- segment_ids.append(0)
- tokens.append(_SEP_TOKEN)
- segment_ids.append(0)
-
- for token in tokens_b:
- tokens.append(token)
- segment_ids.append(1)
- tokens.append(_SEP_TOKEN)
- segment_ids.append(1)
-
- (tokens, masked_lm_positions,
- masked_lm_labels) = create_masked_lm_predictions(
- tokens, masked_lm_prob, max_predictions_per_seq,
- whole_word_mask, vocab, tokenizer,
- _MASK_TOKEN, _CLS_TOKEN, _SEP_TOKEN)
- instance = TrainingInstance(
- tokens=tokens,
- segment_ids=segment_ids,
- is_random_next=is_random_next,
- masked_lm_positions=masked_lm_positions,
- masked_lm_labels=masked_lm_labels,
- vocab=vocab)
- instances.append(instance)
- current_chunk = []
- current_length = 0
- i += 1
-
- return instances
-
-
-MaskedLmInstance = collections.namedtuple('MaskedLmInstance',
- ['index', 'label'])
-
-
-def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq,
- whole_word_mask, vocab, tokenizer,
- _MASK_TOKEN, _CLS_TOKEN, _SEP_TOKEN):
- """Creates the predictions for the masked LM objective."""
- cand_indexes = []
- for (i, token) in enumerate(tokens):
- if token in [_CLS_TOKEN, _SEP_TOKEN]:
- continue
- # Whole Word Masking means that we mask all of the subwords
- # corresponding to an original word. When a word has been split into
- # subwords, the first token does not have any marker and any subsequent
- # tokens are prefixed with ##. So whenever we see a ##-prefixed token, we
- # append it to the previous set of word indexes.
- #
- # Note that Whole Word Masking does *not* change the training code
- # at all -- we still predict each subword independently, softmaxed
- # over the entire vocabulary.
- if whole_word_mask and len(cand_indexes) >= 1 and \
- not tokenizer.is_first_subword(vocab.idx_to_token[token]):
- cand_indexes[-1].append(i)
- else:
- cand_indexes.append([i])
-
- random.shuffle(cand_indexes)
-
- output_tokens = list(tokens)
-
- num_to_predict = min(max_predictions_per_seq,
- max(1, int(round(len(tokens) * masked_lm_prob))))
-
- masked_lms = []
- covered_indexes = set()
- for index_set in cand_indexes:
- if len(masked_lms) >= num_to_predict:
- break
- # If adding a whole-word mask would exceed the maximum number of
- # predictions, then just skip this candidate.
- if len(masked_lms) + len(index_set) > num_to_predict:
- continue
- is_any_index_covered = False
- for index in index_set:
- if index in covered_indexes:
- is_any_index_covered = True
- break
- if is_any_index_covered:
- continue
- for index in index_set:
- covered_indexes.add(index)
- masked_token = None
- # 80% of the time, replace with [MASK]
- if random.random() < 0.8:
- masked_token = _MASK_TOKEN
- else:
- # 10% of the time, keep original
- if random.random() < 0.5:
- masked_token = tokens[index]
- # 10% of the time, replace with random word
- else:
- # generate a random word in [0, vocab_size - 1]
- masked_token = random.randint(0, len(vocab) - 1)
-
- output_tokens[index] = masked_token
-
- masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
- assert len(masked_lms) <= num_to_predict
- masked_lms = sorted(masked_lms, key=lambda x: x.index)
-
- masked_lm_positions = []
- masked_lm_labels = []
- for p in masked_lms:
- masked_lm_positions.append(p.index)
- masked_lm_labels.append(p.label)
-
- return (output_tokens, masked_lm_positions, masked_lm_labels)
-
-
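For reference, the nested branches above implement an 80/10/10 replacement rule (replace with [MASK] / keep the original token / use a random token). The following standalone sketch condenses the same decision into a single draw; it is equivalent in distribution, and the helper name is illustrative only, not part of the deleted script.

import random

def choose_replacement(original_id, mask_id, vocab_size):
    """Pick the replacement id for a position selected for masking (80/10/10 rule)."""
    r = random.random()
    if r < 0.8:                                # 80%: replace with [MASK]
        return mask_id
    if r < 0.9:                                # 10%: keep the original token id
        return original_id
    return random.randint(0, vocab_size - 1)   # 10%: replace with a random token id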
-def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
- """Truncates a pair of sequences to a maximum sequence length."""
- while True:
- total_length = len(tokens_a) + len(tokens_b)
- if total_length <= max_num_tokens:
- break
-
- trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
- assert len(trunc_tokens) >= 1
-
- # We want to sometimes truncate from the front and sometimes from the
- # back to add more randomness and avoid biases.
- if random.random() < 0.5:
- del trunc_tokens[0]
- else:
- trunc_tokens.pop()
-
-
-def main():
- """Main function."""
- time_start = time.time()
-
- # random seed
- random.seed(args.random_seed)
-
- # create output dir
- output_dir = os.path.expanduser(args.output_dir)
- nlp.utils.mkdir(output_dir)
-
- # vocabulary and tokenizer
- if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if args.dataset_name:
- warnings.warn('Both --dataset_name and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece.')
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, num_best=args.sp_nbest,
- alpha=args.sp_alpha, lower=not args.cased)
- else:
- logging.info('loading vocab file from pre-defined dataset: %s', args.dataset_name)
- vocab = nlp.data.utils._load_pretrained_vocab(args.dataset_name, root=output_dir,
- cls=nlp.vocab.BERTVocab)
- tokenizer = BERTTokenizer(vocab=vocab, lower='uncased' in args.dataset_name)
-
- # count the number of input files
- input_files = []
- for input_pattern in args.input_file.split(','):
- input_files.extend(glob.glob(os.path.expanduser(input_pattern)))
- for input_file in input_files:
- logging.info('\t%s', input_file)
- num_inputs = len(input_files)
- num_outputs = min(args.num_outputs, len(input_files))
- logging.info('*** Reading from %d input files ***', num_inputs)
-
- # calculate the number of splits
- file_splits = []
- split_size = (num_inputs + num_outputs - 1) // num_outputs
- for i in range(num_outputs):
- split_start = i * split_size
- split_end = min(num_inputs, (i + 1) * split_size)
- file_splits.append(input_files[split_start:split_end])
-
- # prepare workload
- count = 0
- process_args = []
-
- for i, file_split in enumerate(file_splits):
- output_file = os.path.join(output_dir, 'part-{}.npz'.format(str(i).zfill(3)))
- count += len(file_split)
- process_args.append((file_split, tokenizer, args.max_seq_length, args.short_seq_prob,
- args.masked_lm_prob, args.max_predictions_per_seq,
- args.whole_word_mask,
- vocab, args.dupe_factor, 1, None, output_file))
-
- # sanity check
- assert count == len(input_files)
-
- # dispatch to workers
- nworker = args.num_workers
- if nworker > 1:
- pool = Pool(nworker)
- pool.map(create_training_instances, process_args)
- else:
- for process_arg in process_args:
- create_training_instances(process_arg)
-
- time_end = time.time()
- logging.info('Time cost=%.1f', time_end - time_start)
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Pre-training data generator for BERT',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- parser.add_argument(
- '--input_file',
- type=str,
- required=True,
- help='Input files, separated by comma. For example, "~/data/*.txt"')
-
- parser.add_argument(
- '--output_dir',
- type=str,
- required=True,
- help='Output directory.')
-
- parser.add_argument(
- '--dataset_name',
- type=str,
- default=None,
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased', 'wiki_cn_cased'],
- help='The dataset name for the vocab file BERT model was trained on. For example, '
- '"book_corpus_wiki_en_uncased"')
-
- parser.add_argument(
- '--sentencepiece',
- type=str,
- default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-
- parser.add_argument(
- '--cased',
- action='store_true',
- help='Effective only if --sentencepiece is set')
-
- parser.add_argument('--sp_nbest', type=int, default=0,
- help='Number of best candidates for sampling subwords with sentencepiece. ')
-
- parser.add_argument('--sp_alpha', type=float, default=1.0,
- help='Inverse temperature for probability rescaling for sentencepiece '
- 'unigram sampling')
-
- parser.add_argument(
- '--whole_word_mask',
- action='store_true',
- help='Whether to use whole word masking rather than per-subword masking.')
-
- parser.add_argument(
- '--max_seq_length', type=int, default=512, help='Maximum sequence length.')
-
- parser.add_argument(
- '--max_predictions_per_seq',
- type=int,
- default=80,
- help='Maximum number of masked LM predictions per sequence. ')
-
- parser.add_argument(
- '--random_seed',
- type=int,
- default=12345,
- help='Random seed for data generation.')
-
- parser.add_argument(
- '--dupe_factor',
- type=int,
- default=5,
- help='Number of times to duplicate the input data (with different masks).')
-
- parser.add_argument(
- '--masked_lm_prob',
- type=float,
- default=0.15,
- help='Masked LM probability.')
-
- parser.add_argument(
- '--short_seq_prob',
- type=float,
- default=0.1,
- help='Probability of creating sequences which are shorter than the '
- 'maximum length. ')
-
- parser.add_argument(
- '--verbose',
- action='store_true',
- help='Print debug information')
-
- parser.add_argument(
- '--num_workers',
- type=int,
- default=8,
- help='Number of workers for parallel processing, where each generates an output file.')
-
- parser.add_argument(
- '--num_outputs',
- type=int,
- default=1,
- help='Number of desired output files, where each is processed independently by a worker.')
-
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)
- logging.info(args)
- main()
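For reference, a minimal sketch of the corpus layout this generator expected (one sentence per line, a blank line between documents), together with the same document-splitting rule applied above to tokenized lines. The sentences are made up for illustration.

sample_corpus = (
    'this is the first sentence of document one .\n'
    'this is its second sentence .\n'
    '\n'
    'document two starts after a blank line .\n'
    'its sentences are never paired with document one for next sentence prediction .\n'
)

all_documents = [[]]
for line in sample_corpus.splitlines():
    line = line.strip()
    if not line:                          # blank line marks a document boundary
        if all_documents[-1]:
            all_documents.append([])
    else:
        all_documents[-1].append(line)
all_documents = [doc for doc in all_documents if doc]   # drop any empty document
print(len(all_documents))                 # 2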
diff --git a/scripts/bert/data/embedding.py b/scripts/bert/data/embedding.py
deleted file mode 100644
index 7a609cc6de..0000000000
--- a/scripts/bert/data/embedding.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT embedding datasets."""
-from mxnet.gluon.data import Dataset
-
-__all__ = ['BertEmbeddingDataset']
-
-class BertEmbeddingDataset(Dataset):
- """Dataset for BERT Embedding
-
- Parameters
- ----------
- sentences : List[str].
- Sentences for embeddings.
- transform : BERTDatasetTransform, default None.
- transformer for BERT input format
- """
-
- def __init__(self, sentences, transform=None):
- """Dataset for BERT Embedding
-
- Parameters
- ----------
- sentences : List[str].
- Sentences for embeddings.
- transform : BERTDatasetTransform, default None.
- transformer for BERT input format
- """
- self.sentences = sentences
- self.transform = transform
-
- def __getitem__(self, idx):
- sentence = (self.sentences[idx], 0)
- if self.transform:
- return self.transform(sentence)
- else:
- return sentence
-
- def __len__(self):
- return len(self.sentences)
diff --git a/scripts/bert/data/transform.py b/scripts/bert/data/transform.py
deleted file mode 100644
index d8bef6efc2..0000000000
--- a/scripts/bert/data/transform.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT dataset transform."""
-
-
-__all__ = ['BERTDatasetTransform']
-
-import numpy as np
-from gluonnlp.data import BERTSentenceTransform
-
-class BERTDatasetTransform:
- """Dataset transformation for BERT-style sentence classification or regression.
-
- Parameters
- ----------
- tokenizer : BERTTokenizer.
- Tokenizer for the sentences.
- max_seq_length : int.
- Maximum sequence length of the sentences.
- vocab : Vocab or BERTVocab
- The vocabulary.
- class_labels : list of str, int or float, or None, default None
- All possible labels for a classification task. If None, the task is treated
- as regression and labels are kept as 'float32'; otherwise labels are mapped
- to 'int32' ids.
- label_alias : dict or None, default None
- Optional mapping from alternative label spellings to entries in class_labels.
- pad : bool, default True
- Whether to pad the sentences to maximum length.
- pair : bool, default True
- Whether to transform sentences or sentence pairs.
- has_label : bool, default True
- Whether the input lines contain a label as the last field.
- """
-
- def __init__(self,
- tokenizer,
- max_seq_length,
- vocab=None,
- class_labels=None,
- label_alias=None,
- pad=True,
- pair=True,
- has_label=True):
- self.class_labels = class_labels
- self.has_label = has_label
- self._label_dtype = 'int32' if class_labels else 'float32'
- if has_label and class_labels:
- self._label_map = {}
- for (i, label) in enumerate(class_labels):
- self._label_map[label] = i
- if label_alias:
- for key in label_alias:
- self._label_map[key] = self._label_map[label_alias[key]]
- self._bert_xform = BERTSentenceTransform(
- tokenizer, max_seq_length, vocab=vocab, pad=pad, pair=pair)
-
- def __call__(self, line):
- """Perform transformation for sequence pairs or single sequences.
-
- The transformation is processed in the following steps:
- - tokenize the input sequences
- - insert [CLS], [SEP] as necessary
- - generate type ids to indicate whether a token belongs to the first
- sequence or the second sequence.
- - generate valid length
-
- For sequence pairs, the input is a tuple of 3 strings:
- text_a, text_b and label.
-
- Inputs:
- text_a: 'is this jacksonville ?'
- text_b: 'no it is not .'
- label: '0'
- Tokenization:
- text_a: 'is this jack ##son ##ville ?'
- text_b: 'no it is not .'
- Processed:
- tokens: '[CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]'
- type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
- valid_length: 14
- label: 0
-
- For single sequences, the input is a tuple of 2 strings: text_a and label.
- Inputs:
- text_a: 'the dog is hairy .'
- label: '1'
- Tokenization:
- text_a: 'the dog is hairy .'
- Processed:
- text_a: '[CLS] the dog is hairy . [SEP]'
- type_ids: 0 0 0 0 0 0 0
- valid_length: 7
- label: 1
-
- Parameters
- ----------
- line: tuple of str
- Input strings. For sequence pairs, the input is a tuple of 3 strings:
- (text_a, text_b, label). For single sequences, the input is a tuple
- of 2 strings: (text_a, label).
-
- Returns
- -------
- np.array: input token ids in 'int32', shape (batch_size, seq_length)
- np.array: valid length in 'int32', shape (batch_size,)
- np.array: input token type ids in 'int32', shape (batch_size, seq_length)
- np.array: classification task: label id in 'int32', shape (batch_size, 1),
- regression task: label in 'float32', shape (batch_size, 1)
- """
- if self.has_label:
- input_ids, valid_length, segment_ids = self._bert_xform(line[:-1])
- label = line[-1]
- # map to int if class labels are available
- if self.class_labels:
- label = self._label_map[label]
- label = np.array([label], dtype=self._label_dtype)
- return input_ids, segment_ids, valid_length, label
- else:
- input_ids, valid_length, segment_ids = self._bert_xform(line)
- return input_ids, segment_ids, valid_length
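For reference, a hypothetical driver for the removed BERTDatasetTransform, using the sentence pair from the docstring above. It assumes the class is still importable from its old scripts/bert location and that the pretrained vocabulary can be downloaded; the expected valid length comes from the docstring example.

import gluonnlp as nlp

from data.transform import BERTDatasetTransform   # old scripts/bert layout (assumption)

_, vocab = nlp.model.get_model('bert_12_768_12',
                               dataset_name='book_corpus_wiki_en_uncased',
                               pretrained=False, use_decoder=False, use_classifier=False)
tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
transform = BERTDatasetTransform(tokenizer, max_seq_length=32, vocab=vocab,
                                 class_labels=['0', '1'], pad=True, pair=True)
input_ids, segment_ids, valid_length, label = transform(
    ('is this jacksonville ?', 'no it is not .', '0'))
print(valid_length, label)   # valid length 14 and label id [0] for this pair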
diff --git a/scripts/bert/embedding.py b/scripts/bert/embedding.py
deleted file mode 100644
index 248fba3d32..0000000000
--- a/scripts/bert/embedding.py
+++ /dev/null
@@ -1,271 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BERT embedding."""
-
-import argparse
-import io
-import logging
-import os
-
-import numpy as np
-import mxnet as mx
-
-from mxnet.gluon.data import DataLoader
-
-import gluonnlp
-from gluonnlp.data import BERTTokenizer, BERTSentenceTransform, BERTSPTokenizer
-from gluonnlp.base import get_home_dir
-
-try:
- from data.embedding import BertEmbeddingDataset
-except ImportError:
- from .data.embedding import BertEmbeddingDataset
-
-
-__all__ = ['BertEmbedding']
-
-
-logger = logging.getLogger(__name__)
-
-
-class BertEmbedding:
- """
- Encoding from BERT model.
-
- Parameters
- ----------
- ctx : Context, default mx.cpu()
- The device context (CPU or a GPU) on which BertEmbedding runs.
- dtype: str
- data type to use for the model.
- model : str, default bert_12_768_12.
- pre-trained BERT model
- dataset_name : str, default book_corpus_wiki_en_uncased.
- pre-trained model dataset
- params_path: str, default None
- path to a parameters file to load instead of the pretrained model.
- max_seq_length : int, default 25
- max length of each sequence
- batch_size : int, default 256
- batch size
- sentencepiece : str, default None
- Path to the sentencepiece .model file for both tokenization and vocab
- root : str, default '$MXNET_HOME/models' with MXNET_HOME defaults to '~/.mxnet'
- Location for keeping the model parameters.
- """
- def __init__(self, ctx=mx.cpu(), dtype='float32', model='bert_12_768_12',
- dataset_name='book_corpus_wiki_en_uncased', params_path=None,
- max_seq_length=25, batch_size=256, sentencepiece=None,
- root=os.path.join(get_home_dir(), 'models')):
- self.ctx = ctx
- self.dtype = dtype
- self.max_seq_length = max_seq_length
- self.batch_size = batch_size
- self.dataset_name = dataset_name
-
- # use sentencepiece vocab and a checkpoint
- # we need to set dataset_name to None, otherwise it uses the downloaded vocab
- if params_path and sentencepiece:
- dataset_name = None
- else:
- dataset_name = self.dataset_name
- if sentencepiece:
- vocab = gluonnlp.vocab.BERTVocab.from_sentencepiece(sentencepiece)
- else:
- vocab = None
-
- self.bert, self.vocab = gluonnlp.model.get_model(model,
- dataset_name=dataset_name,
- pretrained=params_path is None,
- ctx=self.ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False,
- root=root, vocab=vocab)
-
- self.bert.cast(self.dtype)
- if params_path:
- logger.info('Loading params from %s', params_path)
- self.bert.load_parameters(params_path, ctx=ctx, ignore_extra=True, cast_dtype=True)
-
- lower = 'uncased' in self.dataset_name
- if sentencepiece:
- self.tokenizer = BERTSPTokenizer(sentencepiece, self.vocab, lower=lower)
- else:
- self.tokenizer = BERTTokenizer(self.vocab, lower=lower)
- self.transform = BERTSentenceTransform(tokenizer=self.tokenizer,
- max_seq_length=self.max_seq_length,
- pair=False)
-
- def __call__(self, sentences, oov_way='avg'):
- return self.embedding(sentences, oov_way=oov_way)
-
- def embedding(self, sentences, oov_way='avg'):
- """
- Get tokens, tokens embedding
-
- Parameters
- ----------
- sentences : List[str]
- sentences for encoding.
- oov_way : str, default avg.
- use **avg**, **sum** or **last** to get token embedding for those out of
- vocabulary words
-
- Returns
- -------
- List[(List[str], List[ndarray])]
- List of tokens, and tokens embedding
- """
- data_iter = self.data_loader(sentences=sentences)
- batches = []
- for token_ids, valid_length, token_types in data_iter:
- token_ids = token_ids.as_in_context(self.ctx)
- valid_length = valid_length.as_in_context(self.ctx)
- token_types = token_types.as_in_context(self.ctx)
- sequence_outputs = self.bert(token_ids, token_types,
- valid_length.astype(self.dtype))
- for token_id, sequence_output in zip(token_ids.asnumpy(),
- sequence_outputs.asnumpy()):
- batches.append((token_id, sequence_output))
- return self.oov(batches, oov_way)
-
- def data_loader(self, sentences, shuffle=False):
- """Load, tokenize and prepare the input sentences."""
- dataset = BertEmbeddingDataset(sentences, self.transform)
- return DataLoader(dataset=dataset, batch_size=self.batch_size, shuffle=shuffle)
-
- def oov(self, batches, oov_way='avg'):
- """
- Merge subword embeddings back into word-level embeddings (OOV handling).
- Also filters out the [CLS] and [SEP] tokens.
-
- Parameters
- ----------
- batches : List[(tokens_id, sequence_outputs)].
- batch token_ids shape is (max_seq_length,),
- sequence_outputs shape is (max_seq_length, dim)
- oov_way : str
- use **avg**, **sum** or **last** to get token embedding for those out of
- vocabulary words
-
- Returns
- -------
- List[(List[str], List[ndarray])]
- List of tokens, and tokens embedding
- """
- sentences = []
- padding_idx, cls_idx, sep_idx = None, None, None
- if self.vocab.padding_token:
- padding_idx = self.vocab[self.vocab.padding_token]
- if self.vocab.cls_token:
- cls_idx = self.vocab[self.vocab.cls_token]
- if self.vocab.sep_token:
- sep_idx = self.vocab[self.vocab.sep_token]
- for token_ids, sequence_outputs in batches:
- tokens = []
- tensors = []
- oov_len = 1
- for token_id, sequence_output in zip(token_ids, sequence_outputs):
- # [PAD] token, sequence is finished.
- if padding_idx and token_id == padding_idx:
- break
- # [CLS], [SEP]
- if cls_idx and token_id == cls_idx:
- continue
- if sep_idx and token_id == sep_idx:
- continue
- token = self.vocab.idx_to_token[token_id]
- if not self.tokenizer.is_first_subword(token):
- tokens.append(token)
- if oov_way == 'last':
- tensors[-1] = sequence_output
- else:
- tensors[-1] += sequence_output
- if oov_way == 'avg':
- oov_len += 1
- else: # iv, avg last oov
- if oov_len > 1:
- tensors[-1] /= oov_len
- oov_len = 1
- tokens.append(token)
- tensors.append(sequence_output)
- if oov_len > 1: # if the whole sentence is one oov, handle this special case
- tensors[-1] /= oov_len
- sentences.append((tokens, tensors))
- return sentences
-
-
-if __name__ == '__main__':
- np.set_printoptions(threshold=5)
- parser = argparse.ArgumentParser(description='Get embeddings from BERT',
- formatter_class=argparse.RawTextHelpFormatter)
- parser.add_argument('--gpu', type=int, default=None,
- help='Id of the GPU to use. If not set, the CPU is used.')
- parser.add_argument('--dtype', type=str, default='float32', help='data dtype')
- parser.add_argument('--model', type=str, default='bert_12_768_12',
- help='pre-trained model')
- parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased',
- help='name of the dataset used for pre-training')
- parser.add_argument('--params_path', type=str, default=None,
- help='path to a params file to load instead of the pretrained model.')
- parser.add_argument('--sentencepiece', type=str, default=None,
- help='Path to the sentencepiece .model file for tokenization and vocab.')
- parser.add_argument('--max_seq_length', type=int, default=128,
- help='max length of each sequence')
- parser.add_argument('--batch_size', type=int, default=256,
- help='batch size')
- parser.add_argument('--oov_way', type=str, default='avg',
- help='how to handle subword embeddings\n'
- 'avg: average all subword embeddings to represent the original token\n'
- 'sum: sum all subword embeddings to represent the original token\n'
- 'last: use last subword embeddings to represent the original token\n')
- parser.add_argument('--sentences', type=str, nargs='+', default=None,
- help='sentence for encoding')
- parser.add_argument('--file', type=str, default=None,
- help='file for encoding')
- parser.add_argument('--verbose', action='store_true', help='verbose logging')
- args = parser.parse_args()
-
- level = logging.DEBUG if args.verbose else logging.INFO
- logging.getLogger().setLevel(level)
- logging.info(args)
-
- if args.gpu is not None:
- context = mx.gpu(args.gpu)
- else:
- context = mx.cpu()
- bert_embedding = BertEmbedding(ctx=context, model=args.model, dataset_name=args.dataset_name,
- max_seq_length=args.max_seq_length, batch_size=args.batch_size,
- params_path=args.params_path, sentencepiece=args.sentencepiece)
- result = []
- sents = []
- if args.sentences:
- sents = args.sentences
- result = bert_embedding(sents, oov_way=args.oov_way)
- elif args.file:
- with io.open(args.file, 'r', encoding='utf8') as in_file:
- for line in in_file:
- sents.append(line.strip())
- result = bert_embedding(sents, oov_way=args.oov_way)
- else:
- logger.error('Please specify --sentence or --file')
-
- if result:
- for _, embeddings in zip(sents, result):
- sent, tokens_embedding = embeddings
- print('Text: {}'.format(' '.join(sent)))
- print('Tokens embedding: {}'.format(tokens_embedding))
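For reference, a hypothetical usage of the removed BertEmbedding helper, mirroring the __main__ block above. It downloads the pretrained bert_12_768_12 weights on first use; the import path assumes the old scripts/bert layout and the example sentence is made up.

import mxnet as mx

from embedding import BertEmbedding   # old scripts/bert layout (assumption)

bert_embedding = BertEmbedding(ctx=mx.cpu(), max_seq_length=32, batch_size=8)
sentences = ['gluonnlp makes bert embeddings easy .']
for tokens, vectors in bert_embedding(sentences, oov_way='avg'):
    for token, vector in zip(tokens, vectors):
        print(token, vector.shape)     # (768,) per token for the base model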
diff --git a/scripts/bert/export.py b/scripts/bert/export.py
deleted file mode 100644
index 92778d7975..0000000000
--- a/scripts/bert/export.py
+++ /dev/null
@@ -1,222 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""
-Export the BERT Model for Deployment
-====================================
-
-This script exports the BERT model to a hybrid model serialized as a symbol.json file,
- which is suitable for deployment or for use with the MXNet Module API.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-import argparse
-import logging
-import warnings
-import os
-import time
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import get_model, BERTClassifier
-from model.qa import BertForQA
-
-nlp.utils.check_version('0.8.1')
-
-parser = argparse.ArgumentParser(description='Export hybrid BERT base model.')
-
-parser.add_argument('--model_parameters',
- type=str,
- default=None,
- help='The model parameter file saved from training.')
-
-parser.add_argument('--model_name',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='BERT model name. Options are "bert_12_768_12" and "bert_24_1024_16"')
-
-parser.add_argument('--task',
- type=str,
- choices=['classification', 'regression', 'question_answering'],
- required=True,
- help='Task to export. Options are "classification", "regression", '
- '"question_answering"')
-
-parser.add_argument('--dataset_name',
- type=str,
- default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'],
- help='BERT dataset name. Options include '
- '"book_corpus_wiki_en_uncased", "book_corpus_wiki_en_cased", '
- '"wiki_multilingual_uncased", "wiki_multilingual_cased", '
- '"wiki_cn_cased"')
-
-parser.add_argument('--output_dir',
- type=str,
- default='./output_dir',
- help='The directory where the exported model symbol will be created. '
- 'The default is ./output_dir')
-
-parser.add_argument('--seq_length',
- type=int,
- default=64,
- help='The maximum total input sequence length after WordPiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. The default is 64.')
-
-parser.add_argument('--dropout',
- type=float,
- default=0.1,
- help='The dropout probability for the classification/regression head.')
-
-args = parser.parse_args()
-
-# create output dir
-output_dir = args.output_dir
-nlp.utils.mkdir(output_dir)
-
-###############################################################################
-# Logging #
-###############################################################################
-
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh = logging.FileHandler(os.path.join(args.output_dir, 'hybrid_export_bert.log'), mode='w')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-log.info(args)
-
-###############################################################################
-# Hybridize the model #
-###############################################################################
-
-seq_length = args.seq_length
-
-if args.task == 'classification':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False)
- net = BERTClassifier(bert, num_classes=2, dropout=args.dropout)
-elif args.task == 'regression':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False)
- net = BERTClassifier(bert, num_classes=1, dropout=args.dropout)
-elif args.task == 'question_answering':
- bert, _ = get_model(
- name=args.model_name,
- dataset_name=args.dataset_name,
- pretrained=False,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
- net = BertForQA(bert)
-else:
- raise ValueError('unknown task: %s'%args.task)
-
-if args.model_parameters:
- net.load_parameters(args.model_parameters)
-else:
- net.initialize()
- warnings.warn('--model_parameters is not provided. The parameter checkpoint (.params) '
- 'file will be created based on default parameter initialization.')
-
-net.hybridize(static_alloc=True, static_shape=True)
-
-###############################################################################
-# Prepare dummy input data #
-###############################################################################
-
-test_batch_size = 1
-
-inputs = mx.nd.arange(test_batch_size * seq_length)
-inputs = inputs.reshape(shape=(test_batch_size, seq_length))
-token_types = mx.nd.zeros_like(inputs)
-valid_length = mx.nd.arange(test_batch_size)
-batch = inputs, token_types, valid_length
-
-def export(batch, prefix):
- """Export the model."""
- log.info('Exporting the model ... ')
- inputs, token_types, valid_length = batch
- net(inputs, token_types, valid_length)
- net.export(prefix, epoch=0)
- assert os.path.isfile(prefix + '-symbol.json')
- assert os.path.isfile(prefix + '-0000.params')
-
-def infer(prefix):
- """Evaluate the model on a mini-batch."""
- log.info('Start inference ... ')
-
- # import with SymbolBlock. Alternatively, you can use Module.load APIs.
- imported_net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
- ['data0', 'data1', 'data2'],
- prefix + '-0000.params')
-
- # exported model should be length-agnostic. Using a different seq_length should work
- inputs = mx.nd.arange(test_batch_size * (seq_length + 10))
- inputs = inputs.reshape(shape=(test_batch_size, seq_length + 10))
- token_types = mx.nd.zeros_like(inputs)
- valid_length = mx.nd.arange(test_batch_size)
-
- # run forward inference
- imported_net(inputs, token_types, valid_length)
- mx.nd.waitall()
-
- # benchmark speed after warmup
- tic = time.time()
- num_trials = 10
- for _ in range(num_trials):
- imported_net(inputs, token_types, valid_length)
- mx.nd.waitall()
- toc = time.time()
- log.info('Batch size={}, Throughput={:.2f} batches/s'
- .format(test_batch_size, num_trials / (toc - tic)))
-
-
-###############################################################################
-# Export the model #
-###############################################################################
-if __name__ == '__main__':
- prefix = os.path.join(args.output_dir, args.task)
- export(batch, prefix)
- infer(prefix)
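For reference, a minimal sketch of consuming the exported artifacts outside this script, mirroring infer() above. The prefix assumes the default --output_dir and --task classification; adjust it to whatever export() actually wrote.

import mxnet as mx

prefix = './output_dir/classification'
net = mx.gluon.nn.SymbolBlock.imports(prefix + '-symbol.json',
                                      ['data0', 'data1', 'data2'],
                                      prefix + '-0000.params')
batch_size, seq_len = 1, 96                  # the exported graph is length-agnostic
inputs = mx.nd.zeros((batch_size, seq_len))
token_types = mx.nd.zeros_like(inputs)
valid_length = mx.nd.array([seq_len])
out = net(inputs, token_types, valid_length)
print(out.shape)                             # (1, 2) for the two-class classification head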
diff --git a/scripts/bert/finetune_classifier.py b/scripts/bert/finetune_classifier.py
deleted file mode 100644
index 8a400fb8b9..0000000000
--- a/scripts/bert/finetune_classifier.py
+++ /dev/null
@@ -1,704 +0,0 @@
-"""
- Sentence Pair Classification with Bidirectional Encoder Representations from Transformers
- =========================================================================================
-
- This example shows how to fine-tune a model with pre-trained BERT parameters for
- sentence pair classification with the Gluon NLP Toolkit.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import io
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-from functools import partial
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-from mxnet.contrib.amp import amp
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer
-from gluonnlp.data.classification import get_task
-from gluonnlp.data.bert.glue import truncate_seqs_equal, concat_sequences
-from gluonnlp.model import BERTClassifier, RoBERTaClassifier
-from gluonnlp.calibration import BertLayerCollector
-
-nlp.utils.check_version('0.9', warning_only=True)
-
-parser = argparse.ArgumentParser(
- description='BERT fine-tune examples for classification/regression tasks.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='The optimizer to be used for training')
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs.')
-parser.add_argument(
- '--training_steps', type=int, help='The total training steps. '
- 'Note that if specified, epochs will be ignored.')
-parser.add_argument(
- '--batch_size',
- type=int,
- default=32,
- help='Batch size. Number of examples per gpu in a minibatch.')
-parser.add_argument(
- '--dev_batch_size',
- type=int,
- default=8,
- help='Batch size for dev set and test set')
-parser.add_argument(
- '--lr',
- type=float,
- default=3e-5,
- help='Initial learning rate')
-parser.add_argument(
- '--epsilon',
- type=float,
- default=1e-6,
- help='Small value to avoid division by 0'
-)
-parser.add_argument(
- '--warmup_ratio',
- type=float,
- default=0.1,
- help='Ratio of warmup steps in the learning rate schedule (linear warmup, then linear decay)')
-parser.add_argument(
- '--log_interval',
- type=int,
- default=10,
- help='report interval')
-parser.add_argument(
- '--max_len',
- type=int,
- default=128,
- help='Maximum length of the sentence pairs')
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-parser.add_argument(
- '--seed', type=int, default=2, help='Random seed')
-parser.add_argument(
- '--accumulate',
- type=int,
- default=None,
- help='The number of batches for gradients accumulation to simulate large batch size. '
- 'Default is None')
-parser.add_argument(
- '--gpu', type=int, default=None, help='Which gpu for finetuning.')
-parser.add_argument(
- '--task_name',
- type=str,
- choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA',
- 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'],
- help='The name of the task to fine-tune. Choices include MRPC, QNLI, RTE, '
- 'STS-B, CoLA, MNLI, WNLI, SST, XNLI, LCQMC and ChnSentiCorp.')
-parser.add_argument(
- '--bert_model',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16', 'roberta_12_768_12', 'roberta_24_1024_16'],
- help='The name of pre-trained BERT model to fine-tune')
-parser.add_argument(
- '--bert_dataset',
- type=str,
- default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'openwebtext_book_corpus_wiki_en_uncased', 'wiki_multilingual_uncased',
- 'wiki_multilingual_cased', 'wiki_cn_cased',
- 'openwebtext_ccnews_stories_books_cased'],
- help='The dataset BERT pre-trained with.')
-parser.add_argument(
- '--pretrained_bert_parameters',
- type=str,
- default=None,
- help='Pre-trained bert model parameter file.')
-parser.add_argument(
- '--model_parameters',
- type=str,
- default=None,
- help='A parameter file for the model that is loaded into the model'
- ' before training/inference. It is different from the parameter'
- ' file written after the model is trained.')
-parser.add_argument(
- '--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.')
-parser.add_argument(
- '--only_inference',
- action='store_true',
- help='If set, we skip training and only perform inference on dev and test data.')
-parser.add_argument(
- '--dtype',
- type=str,
- default='float32',
- choices=['float32', 'float16'],
- help='The data type for training.')
-parser.add_argument(
- '--early_stop',
- type=int,
- default=None,
- help='Whether to perform early stopping based on the metric on dev set. '
- 'The provided value is the patience. ')
-parser.add_argument('--deploy', action='store_true',
- help='whether load static model for deployment')
-parser.add_argument('--model_prefix', type=str, required=False,
- help='load static model as hybridblock.')
-parser.add_argument('--only_calibration', action='store_true',
- help='quantize model')
-parser.add_argument('--num_calib_batches', type=int, default=5,
- help='number of batches for calibration')
-parser.add_argument('--quantized_dtype', type=str, default='auto',
- choices=['auto', 'int8', 'uint8'],
- help='quantization destination data type for input data')
-parser.add_argument('--calib_mode', type=str, default='customize',
- choices=['none', 'naive', 'entropy', 'customize'],
- help='calibration mode used for generating calibration table '
- 'for the quantized symbol.')
-
-args = parser.parse_args()
-
-
-log = logging.getLogger()
-log.setLevel(logging.INFO)
-
-logging.captureWarnings(True)
-fh = logging.FileHandler('log_{0}.txt'.format(args.task_name))
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-logging.info(args)
-
-batch_size = args.batch_size
-dev_batch_size = args.dev_batch_size
-task_name = args.task_name
-lr = args.lr
-epsilon = args.epsilon
-accumulate = args.accumulate
-log_interval = args.log_interval * accumulate if accumulate else args.log_interval
-if accumulate:
- logging.info('Using gradient accumulation. Effective batch size = ' \
- 'batch_size * accumulate = %d', accumulate * batch_size)
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
-
-task = get_task(task_name)
-
-# data type with mixed precision training
-if args.dtype == 'float16':
- amp.init()
-
-# model and loss
-only_inference = args.only_inference
-model_name = args.bert_model
-dataset = args.bert_dataset
-pretrained_bert_parameters = args.pretrained_bert_parameters
-model_parameters = args.model_parameters
-
-# load symbolic model
-deploy = args.deploy
-model_prefix = args.model_prefix
-
-if only_inference and not model_parameters:
- warnings.warn('model_parameters is not set. '
- 'Randomly initialized model will be used for inference.')
-
-get_pretrained = not (pretrained_bert_parameters is not None or model_parameters is not None)
-
-use_roberta = 'roberta' in model_name
-get_model_params = {
- 'name': model_name,
- 'dataset_name': dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctx,
- 'use_decoder': False,
- 'use_classifier': False,
-}
-# RoBERTa does not contain parameters for sentence pair classification
-if not use_roberta:
- get_model_params['use_pooler'] = True
-
-bert, vocabulary = nlp.model.get_model(**get_model_params)
-
-# initialize the rest of the parameters
-initializer = mx.init.Normal(0.02)
-# STS-B is a regression task.
-# STSBTask().class_labels returns None
-do_regression = not task.class_labels
-if do_regression:
- num_classes = 1
- loss_function = gluon.loss.L2Loss()
-else:
- num_classes = len(task.class_labels)
- loss_function = gluon.loss.SoftmaxCELoss()
-# reuse the BERTClassifier class with num_classes=1 for regression
-if use_roberta:
- model = RoBERTaClassifier(bert, dropout=0.0, num_classes=num_classes)
-else:
- model = BERTClassifier(bert, dropout=0.1, num_classes=num_classes)
-# initialize classifier
-if not model_parameters:
- model.classifier.initialize(init=initializer, ctx=ctx)
-
-# load checkpointing
-output_dir = args.output_dir
-if pretrained_bert_parameters:
- logging.info('loading bert params from %s', pretrained_bert_parameters)
- nlp.utils.load_parameters(model.bert, pretrained_bert_parameters, ctx=ctx, ignore_extra=True,
- cast_dtype=True)
-if model_parameters:
- logging.info('loading model params from %s', model_parameters)
- nlp.utils.load_parameters(model, model_parameters, ctx=ctx, cast_dtype=True)
-nlp.utils.mkdir(output_dir)
-
-logging.debug(model)
-model.hybridize(static_alloc=True)
-loss_function.hybridize(static_alloc=True)
-
-if deploy:
- logging.info('load symbol file directly as SymbolBlock for model deployment')
- model = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(args.model_prefix))
- model.hybridize(static_alloc=True, static_shape=True)
-
-# data processing
-do_lower_case = 'uncased' in dataset
-if use_roberta:
- bert_tokenizer = nlp.data.GPT2BPETokenizer()
-else:
- bert_tokenizer = BERTTokenizer(vocabulary, lower=do_lower_case)
-
-# calibration config
-only_calibration = args.only_calibration
-num_calib_batches = args.num_calib_batches
-quantized_dtype = args.quantized_dtype
-calib_mode = args.calib_mode
-
-def convert_examples_to_features(example, tokenizer=None, truncate_length=512, cls_token=None,
- sep_token=None, class_labels=None, label_alias=None, vocab=None,
- is_test=False):
- """convert glue examples into necessary features"""
- if not is_test:
- label_dtype = 'int32' if class_labels else 'float32'
- # get the label
- label = example[-1]
- example = example[:-1]
- #create label maps if classification task
- if class_labels:
- label_map = {}
- for (i, l) in enumerate(class_labels):
- label_map[l] = i
- if label_alias:
- for key in label_alias:
- label_map[key] = label_map[label_alias[key]]
- label = label_map[label]
- label = np.array([label], dtype=label_dtype)
-
- # tokenize raw text
- tokens_raw = [tokenizer(l) for l in example]
- # truncate to the truncate_length,
- tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
- # concatenate the sequences with special tokens
- tokens_trun[0] = [cls_token] + tokens_trun[0]
- tokens, segment_ids, _ = concat_sequences(tokens_trun, [[sep_token]] * len(tokens_trun))
- # convert the token to ids
- input_ids = vocab[tokens]
- valid_length = len(input_ids)
- if not is_test:
- return input_ids, segment_ids, valid_length, label
- else:
- return input_ids, segment_ids, valid_length
-
-
-def preprocess_data(tokenizer, task, batch_size, dev_batch_size, max_len, vocab):
- """Train/eval Data preparation function."""
- label_dtype = 'int32' if task.class_labels else 'float32'
- truncate_length = max_len - 3 if task.is_pair else max_len - 2
- trans = partial(convert_examples_to_features, tokenizer=tokenizer,
- truncate_length=truncate_length,
- cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
- sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
- class_labels=task.class_labels, label_alias=task.label_alias, vocab=vocab)
-
- # data train
- # task.dataset_train returns (segment_name, dataset)
- train_tsv = task.dataset_train()[1]
- data_train = mx.gluon.data.SimpleDataset(list(map(trans, train_tsv)))
- data_train_len = data_train.transform(lambda _, segment_ids, valid_length, label: valid_length,
- lazy=False)
- # bucket sampler for training
- pad_val = vocabulary[vocabulary.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to), # segment
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Stack(label_dtype)) # label
- batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len, batch_size=batch_size,
- num_buckets=10, ratio=0, shuffle=True)
- # data loader for training
- loader_train = gluon.data.DataLoader(dataset=data_train, num_workers=4,
- batch_sampler=batch_sampler, batchify_fn=batchify_fn)
-
- # data dev. For MNLI, more than one dev set is available
- dev_tsv = task.dataset_dev()
- dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
- loader_dev_list = []
- for segment, data in dev_tsv_list:
- data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
- loader_dev = mx.gluon.data.DataLoader(data_dev, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=batchify_fn)
- loader_dev_list.append((segment, loader_dev))
-
- # batchify for data test
- test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to),
- nlp.data.batchify.Stack())
- # transform for data test
- test_trans = partial(convert_examples_to_features, tokenizer=tokenizer, truncate_length=max_len,
- cls_token=vocab.cls_token if not use_roberta else vocab.bos_token,
- sep_token=vocab.sep_token if not use_roberta else vocab.eos_token,
- class_labels=None, is_test=True, vocab=vocab)
-
- # data test. For MNLI, more than one test set is available
- test_tsv = task.dataset_test()
- test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
- loader_test_list = []
- for segment, data in test_tsv_list:
- data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data)))
- loader_test = mx.gluon.data.DataLoader(data_test, batch_size=dev_batch_size, num_workers=4,
- shuffle=False, batchify_fn=test_batchify_fn)
- loader_test_list.append((segment, loader_test))
- return loader_train, loader_dev_list, loader_test_list, len(data_train)
-
-
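For reference, a standalone toy illustration of the bucketing and batchify pattern used in preprocess_data() above: variable-length sequences are bucketed by length and padded per batch. The token ids, segment ids and labels are made up.

import gluonnlp as nlp
from mxnet import gluon

data = [([1, 2, 3], [0, 0, 0], 3, 1),
        ([4, 5], [0, 0], 2, 0),
        ([6, 7, 8, 9, 10], [0, 0, 0, 1, 1], 5, 1)]
dataset = gluon.data.SimpleDataset(data)
batchify_fn = nlp.data.batchify.Tuple(
    nlp.data.batchify.Pad(axis=0, pad_val=0),    # input ids, padded per batch
    nlp.data.batchify.Pad(axis=0, pad_val=0),    # segment ids
    nlp.data.batchify.Stack(),                   # valid lengths
    nlp.data.batchify.Stack('int32'))            # labels
sampler = nlp.data.sampler.FixedBucketSampler([len(x[0]) for x in data],
                                              batch_size=2, num_buckets=2, shuffle=True)
loader = gluon.data.DataLoader(dataset, batch_sampler=sampler, batchify_fn=batchify_fn)
for ids, segs, lengths, labels in loader:
    print(ids.shape, lengths.asnumpy(), labels.asnumpy())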
-# Get the loader.
-logging.info('processing dataset...')
-train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
- bert_tokenizer, task, batch_size, dev_batch_size, args.max_len, vocabulary)
-
-def calibration(net, dev_data_list, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- assert len(dev_data_list) == 1, \
- 'Currently, MNLI is not supported.'
- assert ctx == mx.cpu(), \
- 'Currently only supports CPU with MKL-DNN backend.'
- logging.info('Now we are doing calibration on dev with %s.', ctx)
- for _, dev_data in dev_data_list:
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=logging)
- num_calib_examples = dev_batch_size * num_calib_batches
- net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_data,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=ctx,
- LayerOutputCollector=collector,
- logger=logging)
- # save params
- ckpt_name = 'model_bert_{0}_quantized_{1}'.format(task_name, calib_mode)
- params_saved = os.path.join(output_dir, ckpt_name)
- net.export(params_saved, epoch=0)
- logging.info('Saving quantized model at %s', output_dir)
-
-
-def test(loader_test, segment):
- """Inference function on the test dataset."""
- logging.info('Now we are doing testing on %s with %s.', segment, ctx)
-
- tic = time.time()
- results = []
- for _, seqs in enumerate(loader_test):
- input_ids, segment_ids, valid_length = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
- if not task.class_labels:
- # regression task
- for result in out.asnumpy().reshape(-1).tolist():
- results.append('{:.3f}'.format(result))
- else:
- # classification task
- indices = mx.nd.topk(out, k=1, ret_typ='indices', dtype='int32').asnumpy()
- for index in indices:
- results.append(task.class_labels[int(index)])
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- dev_batch_size * len(loader_test) / (toc - tic))
- # write result to a file.
- segment = segment.replace('_mismatched', '-mm')
- segment = segment.replace('_matched', '-m')
- segment = segment.replace('SST', 'SST-2')
- filename = args.task_name + segment.replace('test', '') + '.tsv'
- test_path = os.path.join(args.output_dir, filename)
- with io.open(test_path, 'w', encoding='utf-8') as f:
- f.write(u'index\tprediction\n')
- for i, pred in enumerate(results):
- f.write(u'%d\t%s\n' % (i, str(pred)))
-
-
-def log_train(batch_id, batch_num, metric, step_loss, log_interval, epoch_id, learning_rate):
- """Generate and print out the log message for training. """
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
-
- train_str = '[Epoch %d Batch %d/%d] loss=%.4f, lr=%.7f, metrics:' + \
- ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(train_str, epoch_id + 1, batch_id + 1, batch_num, step_loss / log_interval,
- learning_rate, *metric_val)
-
-
-def log_eval(batch_id, batch_num, metric, step_loss, log_interval):
- """Generate and print out the log message for inference. """
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
-
- eval_str = '[Batch %d/%d] loss=%.4f, metrics:' + \
- ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(eval_str, batch_id + 1, batch_num, step_loss / log_interval, *metric_val)
-
-
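For reference, the learning rate schedule applied step by step inside train() below, factored out as a standalone function (the function name is illustrative only): linear warmup to the base rate, followed by linear decay to zero.

def linear_warmup_then_decay(step_num, base_lr, num_train_steps, warmup_ratio=0.1):
    """Learning rate at a given step: linear warmup, then linear decay to zero."""
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return base_lr - offset * base_lr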
-def train(metric):
- """Training function."""
- if not only_inference:
- logging.info('Now we are doing BERT classification training on %s!', ctx)
-
- all_model_params = model.collect_params()
- optimizer_params = {'learning_rate': lr, 'epsilon': epsilon, 'wd': 0.01}
- trainer = gluon.Trainer(all_model_params, args.optimizer, optimizer_params,
- update_on_kvstore=False)
- if args.dtype == 'float16':
- amp.init_trainer(trainer)
-
- epoch_number = args.epochs
- step_size = batch_size * accumulate if accumulate else batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 9999
-
- logging.info('training steps=%d', num_train_steps)
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- step_num = 0
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in all_model_params.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if accumulate and accumulate > 1:
- for p in params:
- p.grad_req = 'add'
- # track best eval score
- metric_history = []
- best_metric = None
- patience = args.early_stop
-
- tic = time.time()
- finish_flag = False
- for epoch_id in range(epoch_number):
- if args.early_stop and patience == 0:
- logging.info('Early stopping at epoch %d', epoch_id)
- break
- if finish_flag:
- break
- if not only_inference:
- metric.reset()
- step_loss = 0
- tic = time.time()
- all_model_params.zero_grad()
-
- for batch_id, seqs in enumerate(train_data):
- # learning rate schedule
- if step_num < num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- non_warmup_steps = step_num - num_warmup_steps
- offset = non_warmup_steps / (num_train_steps - num_warmup_steps)
- new_lr = lr - offset * lr
- trainer.set_learning_rate(new_lr)
-
- # forward and backward
- with mx.autograd.record():
- input_ids, segment_ids, valid_length, label = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- label = label.as_in_context(ctx)
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
- ls = loss_function(out, label).mean()
- if args.dtype == 'float16':
- with amp.scale_loss(ls, trainer) as scaled_loss:
- mx.autograd.backward(scaled_loss)
- else:
- ls.backward()
-
- # update
- if not accumulate or (batch_id + 1) % accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.update(accumulate if accumulate else 1)
- step_num += 1
- if accumulate and accumulate > 1:
- # set grad to zero for gradient accumulation
- all_model_params.zero_grad()
-
- step_loss += ls.asscalar()
- if not do_regression:
- label = label.reshape((-1))
- metric.update([label], [out])
- if (batch_id + 1) % (args.log_interval) == 0:
- log_train(batch_id, len(train_data), metric, step_loss, args.log_interval,
- epoch_id, trainer.learning_rate)
- step_loss = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
- mx.nd.waitall()
-
- # inference on dev data
- for segment, dev_data in dev_data_list:
- metric_nm, metric_val = evaluate(dev_data, metric, segment)
- if best_metric is None or metric_val >= best_metric:
- best_metric = metric_val
- patience = args.early_stop
- else:
- if args.early_stop is not None:
- patience -= 1
- metric_history.append((epoch_id, metric_nm, metric_val))
-
- if not only_inference:
- # save params
- ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
-
- nlp.utils.save_parameters(model, params_saved)
- logging.info('params saved in: %s', params_saved)
- toc = time.time()
- logging.info('Time cost=%.2fs', toc - tic)
- tic = toc
-
- if not only_inference:
- # we choose the best model based on metric[0],
- # assuming higher score stands for better model quality
- metric_history.sort(key=lambda x: x[2][0], reverse=True)
- epoch_id, metric_nm, metric_val = metric_history[0]
- ckpt_name = 'model_bert_{0}_{1}.params'.format(task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.load_parameters(model, params_saved)
- metric_str = 'Best model at epoch {}. Validation metrics:'.format(epoch_id)
- metric_str += ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- # inference on test data
- for segment, test_data in test_data_list:
- test(test_data, segment)
-
-
-def evaluate(loader_dev, metric, segment):
- """Evaluate the model on validation dataset."""
- logging.info('Now we are doing evaluation on %s with %s.', segment, ctx)
- metric.reset()
- step_loss = 0
- tic = time.time()
- for batch_id, seqs in enumerate(loader_dev):
- input_ids, segment_ids, valid_length, label = seqs
- input_ids = input_ids.as_in_context(ctx)
- valid_length = valid_length.as_in_context(ctx).astype('float32')
- label = label.as_in_context(ctx)
- if use_roberta:
- out = model(input_ids, valid_length)
- else:
- out = model(input_ids, segment_ids.as_in_context(ctx), valid_length)
-
- ls = loss_function(out, label).mean()
- step_loss += ls.asscalar()
- if not do_regression:
- label = label.reshape((-1))
- metric.update([label], [out])
- if (batch_id + 1) % (args.log_interval) == 0:
- log_eval(batch_id, len(loader_dev), metric, step_loss, args.log_interval)
- step_loss = 0
-
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
- metric_str = 'validation metrics:' + ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- dev_batch_size * len(loader_dev) / (toc - tic))
- return metric_nm, metric_val
-
-
-if __name__ == '__main__':
- if only_calibration:
- try:
- calibration(model,
- dev_data_list,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
- except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
- warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
- else:
- train(task.metrics)
diff --git a/scripts/bert/finetune_squad.py b/scripts/bert/finetune_squad.py
deleted file mode 100644
index b807123cd4..0000000000
--- a/scripts/bert/finetune_squad.py
+++ /dev/null
@@ -1,862 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""
-SQuAD with Bidirectional Encoder Representations from Transformers
-==================================================================
-
-This example shows how to fine-tune a model with pre-trained BERT parameters for
-SQuAD, with the Gluon NLP Toolkit.
-
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-import argparse
-import collections
-import json
-import logging
-import os
-import io
-import random
-import time
-import warnings
-import itertools
-import pickle
-import multiprocessing as mp
-from functools import partial
-
-import numpy as np
-import mxnet as mx
-
-import gluonnlp as nlp
-from gluonnlp.data import SQuAD
-from gluonnlp.data.bert.glue import concat_sequences
-from gluonnlp.data.bert.squad import improve_answer_span, \
- tokenize_and_align_positions, get_doc_spans, align_position2doc_spans, \
- check_is_max_context, convert_squad_examples
-from gluonnlp.calibration import BertLayerCollector
-from model.qa import BertForQALoss, BertForQA
-from bert_qa_evaluate import get_F1_EM, predict, PredResult
-
-np.random.seed(6)
-random.seed(6)
-mx.random.seed(6)
-
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(
- fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s', datefmt='%H:%M:%S')
-
-parser = argparse.ArgumentParser(
- description='BERT QA example. '
- 'We fine-tune the BERT model on the SQuAD dataset.')
-
-parser.add_argument('--only_predict',
- action='store_true',
- help='Whether to predict only.')
-
-parser.add_argument('--model_parameters',
- type=str,
- default=None,
- help='Model parameter file')
-
-parser.add_argument('--bert_model',
- type=str,
- default='bert_12_768_12',
- help='BERT model name. options are bert_12_768_12 and bert_24_1024_16.')
-
-parser.add_argument('--bert_dataset',
- type=str,
- default='book_corpus_wiki_en_uncased',
- help='BERT dataset name. '
- 'Options are book_corpus_wiki_en_uncased and book_corpus_wiki_en_cased.')
-
-parser.add_argument('--pretrained_bert_parameters',
- type=str,
- default=None,
- help='Pre-trained bert model parameter file. default is None')
-
-parser.add_argument('--uncased',
- action='store_false',
- help='if not set, inputs are converted to lower case.')
-
-parser.add_argument('--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.'
- ' default is ./output_dir')
-
-parser.add_argument('--epochs',
- type=int,
- default=3,
- help='number of epochs, default is 3')
-parser.add_argument('--training_steps',
- type=int,
- help='number of training steps; epochs will be ignored '
- 'if training_steps is specified.')
-parser.add_argument('--batch_size',
- type=int,
- default=32,
- help='Batch size. Number of examples per gpu in a minibatch. default is 32')
-
-parser.add_argument('--test_batch_size',
- type=int,
- default=24,
- help='Test batch size. default is 24')
-
-parser.add_argument('--optimizer',
- type=str,
- default='bertadam',
- help='optimization algorithm. default is bertadam')
-
-parser.add_argument('--accumulate',
- type=int,
- default=None,
- help='The number of batches for '
- 'gradient accumulation to simulate a large batch size. Default is None')
-
-parser.add_argument('--lr',
- type=float,
- default=5e-5,
- help='Initial learning rate. default is 5e-5')
-
-parser.add_argument('--warmup_ratio',
- type=float,
- default=0.1,
- help='ratio of warmup steps that linearly increase learning rate from '
- '0 to target learning rate. default is 0.1')
-
-parser.add_argument('--log_interval',
- type=int,
- default=50,
- help='report interval. default is 50')
-
-parser.add_argument('--max_seq_length',
- type=int,
- default=384,
- help='The maximum total input sequence length after WordPiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. default is 384')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with Tensor Cores.')
-
-parser.add_argument('--doc_stride',
- type=int,
- default=128,
- help='When splitting up a long document into chunks, how much stride to '
- 'take between chunks. default is 128')
-
-parser.add_argument('--max_query_length',
- type=int,
- default=64,
- help='The maximum number of tokens for the question. Questions longer than '
- 'this will be truncated to this length. default is 64')
-
-parser.add_argument('--n_best_size',
- type=int,
- default=20,
- help='The total number of n-best predictions to generate in the '
- 'nbest_predictions.json output file. default is 20')
-
-parser.add_argument('--max_answer_length',
- type=int,
- default=30,
- help='The maximum length of an answer that can be generated. This is needed '
- 'because the start and end predictions are not conditioned on one another.'
- ' default is 30')
-
-parser.add_argument('--version_2',
- action='store_true',
- help='Whether the SQuAD examples contain some that do not have an answer.')
-
-parser.add_argument('--null_score_diff_threshold',
- type=float,
- default=0.0,
- help='If null_score - best_non_null is greater than the threshold, predict null. '
- 'Typical values are between -1.0 and -5.0. default is 0.0')
-
-parser.add_argument('--gpu',
- action='store_true',
- help='use GPU instead of CPU')
-
-parser.add_argument('--sentencepiece',
- type=str,
- default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-
-parser.add_argument('--debug',
- action='store_true',
- help='Run the example in test mode for sanity checks')
-
-parser.add_argument('--dtype',
- type=str,
- default='float32',
- help='Data type used for training. Either float32 or float16')
-
-parser.add_argument('--comm_backend',
- type=str,
- default=None,
- help='Communication backend. Set to horovod if horovod is used for '
- 'multi-GPU training')
-
-parser.add_argument('--deploy', action='store_true',
- help='whether to load a static model for deployment')
-
-parser.add_argument('--model_prefix', type=str, required=False,
- help='load static model as hybridblock.')
-
-parser.add_argument('--only_calibration', action='store_true',
- help='quantize model')
-
-parser.add_argument('--num_calib_batches', type=int, default=10,
- help='number of batches for calibration')
-
-parser.add_argument('--quantized_dtype', type=str, default='auto',
- choices=['auto', 'int8', 'uint8'],
- help='quantization destination data type for input data')
-
-parser.add_argument('--calib_mode', type=str, default='customize',
- choices=['none', 'naive', 'entropy', 'customize'],
- help='calibration mode used for generating calibration table '
- 'for the quantized symbol.')
-
-args = parser.parse_args()
-
-output_dir = args.output_dir
-if not os.path.exists(output_dir):
- os.mkdir(output_dir)
-
-fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'),
- mode='w')
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-
-log.info(args)
-
-if args.comm_backend == 'horovod':
- import horovod.mxnet as hvd
- hvd.init()
- rank = hvd.rank()
- size = hvd.size()
- local_rank = hvd.local_rank()
-else:
- rank = 0
- size = 1
- local_rank = 0
-
-if args.dtype == 'float16':
- from mxnet.contrib import amp
- amp.init()
-
-model_name = args.bert_model
-dataset_name = args.bert_dataset
-only_predict = args.only_predict
-model_parameters = args.model_parameters
-pretrained_bert_parameters = args.pretrained_bert_parameters
-if pretrained_bert_parameters and model_parameters:
- raise ValueError('Cannot provide both pre-trained BERT parameters and '
- 'BertForQA model parameters.')
-lower = args.uncased
-
-batch_size = args.batch_size
-test_batch_size = args.test_batch_size
-lr = args.lr
-ctx = mx.gpu(local_rank) if args.gpu else mx.cpu()
-
-accumulate = args.accumulate
-log_interval = args.log_interval * accumulate if accumulate else args.log_interval
-if accumulate:
- log.info('Using gradient accumulation. Effective total batch size = {}'.
- format(accumulate*batch_size*size))
-
-optimizer = args.optimizer
-warmup_ratio = args.warmup_ratio
-
-
-version_2 = args.version_2
-null_score_diff_threshold = args.null_score_diff_threshold
-
-max_seq_length = args.max_seq_length
-doc_stride = args.doc_stride
-max_query_length = args.max_query_length
-n_best_size = args.n_best_size
-max_answer_length = args.max_answer_length
-
-if max_seq_length <= max_query_length + 3:
- raise ValueError('The max_seq_length (%d) must be greater than max_query_length '
- '(%d) + 3' % (max_seq_length, max_query_length))
-
-# vocabulary and tokenizer
-if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if dataset_name:
- warnings.warn('Both --bert_dataset and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece.')
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
- dataset_name = None
-else:
- vocab = None
-
-pretrained = not model_parameters and not pretrained_bert_parameters and not args.sentencepiece
-bert, vocab = nlp.model.get_model(
- name=model_name,
- dataset_name=dataset_name,
- vocab=vocab,
- pretrained=pretrained,
- ctx=ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-
-if args.sentencepiece:
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab, lower=lower)
-else:
- tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=lower)
-
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Stack(),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'))
-
-# load symbolic model
-deploy = args.deploy
-model_prefix = args.model_prefix
-
-net = BertForQA(bert=bert)
-if model_parameters:
- # load complete BertForQA parameters
- nlp.utils.load_parameters(net, model_parameters, ctx=ctx, cast_dtype=True)
-elif pretrained_bert_parameters:
- # only load BertModel parameters
- nlp.utils.load_parameters(bert, pretrained_bert_parameters, ctx=ctx,
- ignore_extra=True, cast_dtype=True)
- net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-elif pretrained:
- # only load BertModel parameters
- net.span_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-else:
- # no checkpoint is loaded
- net.initialize(init=mx.init.Normal(0.02), ctx=ctx)
-
-net.hybridize(static_alloc=True)
-
-loss_function = BertForQALoss()
-loss_function.hybridize(static_alloc=True)
-
-if deploy:
- logging.info('load symbol file directly as SymbolBlock for model deployment')
- net = mx.gluon.SymbolBlock.imports('{}-symbol.json'.format(args.model_prefix),
- ['data0', 'data1', 'data2'],
- '{}-0000.params'.format(args.model_prefix))
- net.hybridize(static_alloc=True, static_shape=True)
-
-# calibration config
-only_calibration = args.only_calibration
-num_calib_batches = args.num_calib_batches
-quantized_dtype = args.quantized_dtype
-calib_mode = args.calib_mode
-
-def train():
- """Training function."""
- segment = 'train' #if not args.debug else 'dev'
- log.info('Loading %s data...', segment)
- if version_2:
- train_data = SQuAD(segment, version='2.0')
- else:
- train_data = SQuAD(segment, version='1.1')
- if args.debug:
- sampled_data = [train_data[i] for i in range(0, 10000)]
- train_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in Train data:{}'.format(len(train_data)))
- train_data_transform = preprocess_dataset(
- tokenizer,
- train_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True)
-
- log.info('The number of examples after preprocessing:{}'.format(
- len(train_data_transform)))
-
- sampler = nlp.data.SplitSampler(len(train_data_transform), num_parts=size,
- part_index=rank, even_size=True)
- num_train_examples = len(sampler)
- train_dataloader = mx.gluon.data.DataLoader(train_data_transform,
- batchify_fn=batchify_fn,
- batch_size=batch_size,
- num_workers=4,
- sampler=sampler)
-
- log.info('Start Training')
-
- optimizer_params = {'learning_rate': lr, 'wd': 0.01}
- param_dict = net.collect_params()
- if args.comm_backend == 'horovod':
- trainer = hvd.DistributedTrainer(param_dict, optimizer, optimizer_params)
- else:
- trainer = mx.gluon.Trainer(param_dict, optimizer, optimizer_params,
- update_on_kvstore=False)
- if args.dtype == 'float16':
- amp.init_trainer(trainer)
-
- step_size = batch_size * accumulate if accumulate else batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- if args.training_steps:
- num_train_steps = args.training_steps
-
- num_warmup_steps = int(num_train_steps * warmup_ratio)
-
- def set_new_lr(step_num, batch_id):
- """set new learning rate"""
- # set grad to zero for gradient accumulation
- if accumulate:
- if batch_id % accumulate == 0:
- step_num += 1
- else:
- step_num += 1
- # learning rate schedule
- # Linear warmup followed by linear decay: the learning rate increases linearly
- # to `lr` over `num_warmup_steps`, then decays linearly to 0 at `num_train_steps`.
- if step_num < num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- offset = (step_num - num_warmup_steps) * lr / \
- (num_train_steps - num_warmup_steps)
- new_lr = lr - offset
- trainer.set_learning_rate(new_lr)
- return step_num
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in param_dict.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if accumulate:
- for p in params:
- p.grad_req = 'add'
- net.collect_params().zero_grad()
-
- epoch_tic = time.time()
-
- total_num = 0
- log_num = 0
- batch_id = 0
- step_loss = 0.0
- tic = time.time()
- step_num = 0
-
- tic = time.time()
- while step_num < num_train_steps:
- for _, data in enumerate(train_dataloader):
- # set new lr
- step_num = set_new_lr(step_num, batch_id)
- # forward and backward
- _, inputs, token_types, valid_length, start_label, end_label = data
- num_labels = len(inputs)
- log_num += num_labels
- total_num += num_labels
-
- with mx.autograd.record():
- out = net(inputs.as_in_context(ctx),
- token_types.as_in_context(ctx),
- valid_length.as_in_context(ctx).astype('float32'))
-
- loss = loss_function(out, [
- start_label.as_in_context(ctx).astype('float32'),
- end_label.as_in_context(ctx).astype('float32')
- ]).sum() / num_labels
-
- if accumulate:
- loss = loss / accumulate
- if args.dtype == 'float16':
- with amp.scale_loss(loss, trainer) as l:
- mx.autograd.backward(l)
- norm_clip = 1.0 * size * trainer._amp_loss_scaler.loss_scale
- else:
- mx.autograd.backward(loss)
- norm_clip = 1.0 * size
-
- # update
- if not accumulate or (batch_id + 1) % accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, norm_clip)
- trainer.update(1)
- if accumulate:
- param_dict.zero_grad()
-
- if args.comm_backend == 'horovod':
- step_loss += hvd.allreduce(loss, average=True).asscalar()
- else:
- step_loss += loss.asscalar()
-
- if (batch_id + 1) % log_interval == 0:
- toc = time.time()
- log.info('Batch: {}/{}, Loss={:.4f}, lr={:.7f}, '
- 'Throughput={:.2f} samples/s'
- .format(batch_id % len(train_dataloader),
- len(train_dataloader), step_loss / log_interval,
- trainer.learning_rate, log_num/(toc - tic)))
- tic = time.time()
- step_loss = 0.0
- log_num = 0
-
- if step_num >= num_train_steps:
- break
- batch_id += 1
-
- log.info('Finish training step: %d', step_num)
- epoch_toc = time.time()
- log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
- epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))
-
- if rank == 0:
- net.save_parameters(os.path.join(output_dir, 'net.params'))
-
-def calibration(net, num_calib_batches, quantized_dtype, calib_mode):
- """calibration function on the dev dataset."""
- log.info('Loading dev data...')
- if version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- if args.debug:
- sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data:{}'.format(len(dev_data)))
-
- batchify_fn_calib = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], round_to=args.round_to),
- nlp.data.batchify.Stack('float32'),
- nlp.data.batchify.Stack('float32'))
-
- dev_data_transform = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True,
- for_calibration=True)
-
- dev_dataloader = mx.gluon.data.DataLoader(
- dev_data_transform,
- batchify_fn=batchify_fn_calib,
- num_workers=4, batch_size=test_batch_size,
- shuffle=False, last_batch='keep')
-
- assert ctx == mx.cpu(), \
- 'Currently only supports CPU with MKL-DNN backend.'
- log.info('Now we are doing calibration on dev with %s.', ctx)
- collector = BertLayerCollector(clip_min=-50, clip_max=10, logger=log)
- num_calib_examples = test_batch_size * num_calib_batches
- net = mx.contrib.quantization.quantize_net_v2(net, quantized_dtype=quantized_dtype,
- exclude_layers=[],
- quantize_mode='smart',
- quantize_granularity='channel-wise',
- calib_data=dev_dataloader,
- calib_mode=calib_mode,
- num_calib_examples=num_calib_examples,
- ctx=ctx,
- LayerOutputCollector=collector,
- logger=log)
- # save params
- ckpt_name = 'model_bert_squad_quantized_{0}'.format(calib_mode)
- params_saved = os.path.join(output_dir, ckpt_name)
- net.export(params_saved, epoch=0)
- log.info('Saving quantized model at %s', output_dir)
-
-def evaluate():
- """Evaluate the model on validation dataset."""
- log.info('Loading dev data...')
- if version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- if args.debug:
- sampled_data = [dev_data[i] for i in range(100)]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data:{}'.format(len(dev_data)))
-
- dev_dataset = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=False)
-
- dev_data_transform = preprocess_dataset(tokenizer,
- dev_data,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length,
- input_features=True)
-
- log.info('The number of examples after preprocessing:{}'.format(
- len(dev_data_transform)))
-
- dev_dataloader = mx.gluon.data.DataLoader(dev_data_transform,
- batchify_fn=batchify_fn,
- num_workers=4,
- batch_size=test_batch_size,
- shuffle=False,
- last_batch='keep')
-
- log.info('start prediction')
-
- all_results = collections.defaultdict(list)
-
- epoch_tic = time.time()
- total_num = 0
- for data in dev_dataloader:
- example_ids, inputs, token_types, valid_length, _, _ = data
- total_num += len(inputs)
- out = net(inputs.as_in_context(ctx),
- token_types.as_in_context(ctx),
- valid_length.as_in_context(ctx).astype('float32'))
-
- output = mx.nd.split(out, axis=2, num_outputs=2)
- example_ids = example_ids.asnumpy().tolist()
- pred_start = output[0].reshape((0, -3)).asnumpy()
- pred_end = output[1].reshape((0, -3)).asnumpy()
-
- for example_id, start, end in zip(example_ids, pred_start, pred_end):
- all_results[example_id].append(PredResult(start=start, end=end))
-
- epoch_toc = time.time()
- log.info('Time cost={:.2f} s, Throughput={:.2f} samples/s'.format(
- epoch_toc - epoch_tic, total_num / (epoch_toc - epoch_tic)))
-
- log.info('Get prediction results...')
-
- all_predictions = collections.OrderedDict()
-
- for features in dev_dataset:
- results = all_results[features[0].example_id]
- example_qas_id = features[0].qas_id
-
- prediction, _ = predict(
- features=features,
- results=results,
- tokenizer=nlp.data.BERTBasicTokenizer(lower=lower),
- max_answer_length=max_answer_length,
- null_score_diff_threshold=null_score_diff_threshold,
- n_best_size=n_best_size,
- version_2=version_2)
-
- all_predictions[example_qas_id] = prediction
-
- if version_2:
- log.info('Please run evaluate-v2.0.py to get evaluation results for SQuAD 2.0')
- else:
- F1_EM = get_F1_EM(dev_data, all_predictions)
- log.info(F1_EM)
-
- with io.open(os.path.join(output_dir, 'predictions.json'),
- 'w', encoding='utf-8') as fout:
- data = json.dumps(all_predictions, ensure_ascii=False)
- fout.write(data)
-
-
-
-SquadBERTFeautre = collections.namedtuple('SquadBERTFeautre', [
- 'example_id', 'qas_id', 'doc_tokens', 'valid_length', 'tokens',
- 'token_to_orig_map', 'token_is_max_context', 'input_ids', 'p_mask',
- 'segment_ids', 'start_position', 'end_position', 'is_impossible'
-])
-
-
-def convert_examples_to_features(example,
- tokenizer=None,
- cls_token=None,
- sep_token=None,
- vocab=None,
- max_seq_length=384,
- doc_stride=128,
- max_query_length=64,
- cls_index=0):
- """convert the examples to the BERT features"""
- query_tokenized = [cls_token] + tokenizer(
- example.question_text)[:max_query_length]
- #tokenize paragraph and get start/end position of the answer in tokenized paragraph
- tok_start_position, tok_end_position, all_doc_tokens, _, tok_to_orig_index = \
- tokenize_and_align_positions(example.doc_tokens,
- example.start_position,
- example.end_position,
- tokenizer)
- # get doc spans using sliding window
- doc_spans, doc_spans_indices = get_doc_spans(
- all_doc_tokens, max_seq_length - len(query_tokenized) - 2, doc_stride)
-
- if not example.is_impossible:
- (tok_start_position, tok_end_position) = improve_answer_span(
- all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
- example.orig_answer_text)
- # get the new start/end position
- positions = [
- align_position2doc_spans([tok_start_position, tok_end_position],
- doc_idx,
- offset=len(query_tokenized) + 1,
- default_value=0)
- for doc_idx in doc_spans_indices
- ]
- else:
- # if the question is impossible to answer, set the start/end position to cls index
- positions = [[cls_index, cls_index] for _ in doc_spans_indices]
-
- # record whether the tokens in a docspan have max context
- token_is_max_context = [{
- len(query_tokenized) + p:
- check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- token_to_orig_map = [{
- len(query_tokenized) + p + 1:
- tok_to_orig_index[p + doc_spans_indices[i][0]]
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- #get sequence features: tokens, segment_ids, p_masks
- seq_features = [
- concat_sequences([query_tokenized, doc_span], [[sep_token]] * 2)
- for doc_span in doc_spans
- ]
-
- features = [
- SquadBERTFeautre(example_id=example.example_id,
- qas_id=example.qas_id,
- doc_tokens=example.doc_tokens,
- valid_length=len(tokens),
- tokens=tokens,
- token_to_orig_map=t2o,
- token_is_max_context=is_max,
- input_ids=vocab[tokens],
- p_mask=p_mask,
- segment_ids=segment_ids,
- start_position=start,
- end_position=end,
- is_impossible=example.is_impossible)
- for (tokens, segment_ids, p_mask), (start, end), is_max, t2o in zip(
- seq_features, positions, token_is_max_context, token_to_orig_map)
- ]
- return features
-
-
-def preprocess_dataset(tokenizer,
- dataset,
- vocab=None,
- max_seq_length=384,
- doc_stride=128,
- max_query_length=64,
- input_features=True,
- num_workers=4,
- load_from_pickle=False,
- feature_file=None,
- for_calibration=False):
- """Loads a dataset into features"""
- vocab = tokenizer.vocab if vocab is None else vocab
- trans = partial(convert_examples_to_features,
- tokenizer=tokenizer,
- cls_token=vocab.cls_token,
- sep_token=vocab.sep_token,
- vocab=vocab,
- max_seq_length=max_seq_length,
- doc_stride=doc_stride,
- max_query_length=max_query_length)
- pool = mp.Pool(num_workers)
- start = time.time()
- if not load_from_pickle:
- example_trans = partial(convert_squad_examples,
- is_training=input_features)
- # convert the raw dataset into raw features
- examples = pool.map(example_trans, dataset)
- raw_features = pool.map(trans, examples)
- if feature_file:
- with open(feature_file, 'wb') as file:
- pickle.dump(list(raw_features), file)
- else:
- assert feature_file, 'feature file should be provided.'
- with open(feature_file, 'rb') as file:
- raw_features = pickle.load(file)
-
- if input_features:
- # convert the full features into the training features
- # Note that we will need the full features to make evaluation
- # Due to using sliding windows in data preprocessing,
- # we will have multiple examples for a single entry after processed.
- # Thus we need to flatten it for training.
- data_feature = mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(raw_features)))
- if for_calibration:
- data_feature = data_feature.transform(lambda *example: (
- example[7], # inputs_id
- example[9], # segment_ids
- example[3], # valid_length,
- example[10])) # start_position,
- else:
- data_feature = data_feature.transform(lambda *example: (
- example[0], # example_id
- example[7], # inputs_id
- example[9], # segment_ids
- example[3], # valid_length,
- example[10], # start_position,
- example[11])) # end_position
- else:
- data_feature = mx.gluon.data.SimpleDataset(list(raw_features))
-
- end = time.time()
- pool.close()
- print('Done! Transforming the dataset took %.2f seconds.' % (end - start))
- return data_feature
-
-
-if __name__ == '__main__':
- if only_calibration:
- try:
- calibration(net,
- num_calib_batches,
- quantized_dtype,
- calib_mode)
- except AttributeError:
- nlp.utils.version.check_version('1.7.0', warning_only=True, library=mx)
- warnings.warn('INT8 Quantization for BERT needs mxnet-mkl >= 1.6.0b20200115')
- elif not only_predict:
- train()
- evaluate()
- elif model_parameters or deploy:
- evaluate()
diff --git a/scripts/bert/fp16_utils.py b/scripts/bert/fp16_utils.py
deleted file mode 100644
index f74c5528b1..0000000000
--- a/scripts/bert/fp16_utils.py
+++ /dev/null
@@ -1,167 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Trainer for mixed precision training."""
-import warnings
-import mxnet as mx
-import gluonnlp as nlp
-
-class FP16Trainer:
- """ Trainer for mixed precision training.
-
- Parameters
- ----------
- trainer: gluon.Trainer
- the original gluon Trainer object for fp32 training.
- dynamic_loss_scale: bool. Default is True
- whether to use dynamic loss scaling. This is recommended for optimizing model
- parameters using FP16.
- loss_scaler_params : dict
- Keyword arguments to be passed to the loss scaler constructor. For example,
- `{"init_scale" : 2.**10, "scale_window" : 2000, "tolerance" : 0.05}`
- for `DynamicLossScaler`.
- See each `LossScaler` for a list of supported arguments.
- """
- def __init__(self, trainer, dynamic_loss_scale=True, loss_scaler_params=None):
- if trainer._kvstore_params['update_on_kvstore'] is not False and trainer._kvstore:
- err = 'Only gluon.Trainer created with update_on_kvstore=False is supported.'
- raise NotImplementedError(err)
- self.fp32_trainer = trainer
- loss_scaler_params = loss_scaler_params if loss_scaler_params else {}
- self._scaler = DynamicLossScaler(**loss_scaler_params) if dynamic_loss_scale \
- else StaticLossScaler(**loss_scaler_params)
- # if the optimizer supports NaN check, we can always defer the NaN check to the optimizer
- # TODO(haibin) this should be added via registry
- self._support_nan_check = trainer._optimizer.__class__.__name__ == 'BERTAdam'
-
- def backward(self, loss):
- """backward propagation with loss"""
- with mx.autograd.record():
- if isinstance(loss, (tuple, list)):
- ls = [l * self._scaler.loss_scale for l in loss]
- else:
- ls = loss * self._scaler.loss_scale
- mx.autograd.backward(ls)
-
- def step(self, batch_size, max_norm=None):
- """Makes one step of parameter update. Should be called after
- `fp16_optimizer.backward()`, and outside of `record()` scope.
-
- Parameters
- ----------
- batch_size : int
- Batch size of data processed. Gradient will be normalized by `1/batch_size`.
- Set this to 1 if you normalized loss manually with `loss = mean(loss)`.
- max_norm : NDArray, optional, default is None
- max value for global 2-norm of gradients.
- """
- self.fp32_trainer.allreduce_grads()
- step_size = batch_size * self._scaler.loss_scale
- if max_norm:
- _, ratio, is_finite = nlp.utils.grad_global_norm(self.fp32_trainer._params,
- max_norm * self._scaler.loss_scale)
- step_size = ratio * step_size
- if self._support_nan_check:
- self.fp32_trainer.update(step_size)
- overflow = is_finite.asscalar() < 1
- else:
- overflow = is_finite.asscalar() < 1
- if not overflow:
- step_size = step_size.asscalar()
- self.fp32_trainer.update(step_size)
- else:
- # TODO(haibin) optimize the performance when max_norm is not present
- # sequentially adding isnan/isinf results may be slow
- if self._support_nan_check:
- self.fp32_trainer.update(step_size)
- overflow = self._scaler.has_overflow(self.fp32_trainer._params)
- else:
- overflow = self._scaler.has_overflow(self.fp32_trainer._params)
- if not overflow:
- self.fp32_trainer.update(step_size)
- # update scale based on overflow information
- self._scaler.update_scale(overflow)
-
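-# Illustrative usage sketch, added for documentation only (the fine-tuning scripts
-# above use mxnet.contrib.amp rather than this class; `net`, `data`, `label` and
-# `loss_function` are placeholders). Wrap a gluon Trainer created with
-# update_on_kvstore=False, compute the loss under autograd.record(), then let
-# backward() scale the loss and step() unscale, clip and update:
-#
-#   trainer = mx.gluon.Trainer(net.collect_params(), 'bertadam',
-#                              {'learning_rate': 5e-5}, update_on_kvstore=False)
-#   fp16_trainer = FP16Trainer(trainer)
-#   with mx.autograd.record():
-#       loss = loss_function(net(data), label).mean()
-#   fp16_trainer.backward(loss)
-#   fp16_trainer.step(1, max_norm=1)
-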
-class LossScaler:
- """Abstract loss scaler"""
- def has_overflow(self, params):
- """ detect inf and nan """
- is_not_finite = 0
- for param in params:
- if param.grad_req != 'null':
- grad = param.list_grad()[0]
- is_not_finite += mx.nd.contrib.isnan(grad).sum().astype('float32', copy=False)
- is_not_finite += mx.nd.contrib.isinf(grad).sum().astype('float32', copy=False)
- # NDArray is implicitly converted to bool
- if is_not_finite == 0:
- return False
- else:
- return True
-
- def update_scale(self, overflow):
- raise NotImplementedError()
-
-class StaticLossScaler(LossScaler):
- """Static loss scaler"""
- def __init__(self, init_scale=1):
- self.loss_scale = init_scale
-
- def update_scale(self, overflow):
- """update loss scale"""
-
-class DynamicLossScaler(LossScaler):
- """Class that manages dynamic loss scaling.
-
- There are two problems regarding gradient scale when fp16 is used for training.
- One is overflow: the fp16 gradient is so large that it causes NaN.
- To combat such an issue, we need to scale down the gradient when such an event
- is detected. The other is underflow: the gradient is too small for the
- precision to be maintained, which is hard to detect. What the dynamic loss
- scaler does is start the scale at a relatively large value (e.g. 2**15).
- Every time a NaN is detected in the gradient, the scale is reduced (by default)
- by 2x. On the other hand, if no NaN is detected for a long time
- (e.g. 2000 steps), the scale is increased (by default) by 2x."""
- def __init__(self, init_scale=2.**10, scale_factor=2., scale_window=2000,
- tolerance=0.):
- self.loss_scale = init_scale
- self.scale_factor = scale_factor
- self.scale_window = scale_window
- self.tolerance = tolerance
- self._num_steps = 0
- self._last_overflow_iter = -1
- self._last_rescale_iter = -1
- self._overflows_since_rescale = 0
-
- def update_scale(self, overflow):
- """dynamically update loss scale"""
- iter_since_rescale = self._num_steps - self._last_rescale_iter
- if overflow:
- self._last_overflow_iter = self._num_steps
- self._overflows_since_rescale += 1
- percentage = self._overflows_since_rescale / float(iter_since_rescale)
- # we tolerate a certain amount of NaNs before actually scaling it down
- if percentage >= self.tolerance:
- self.loss_scale /= self.scale_factor
- self._last_rescale_iter = self._num_steps
- self._overflows_since_rescale = 0
- if self.loss_scale < 1:
- warnings.warn('DynamicLossScaler: overflow detected. set loss_scale = %s'%
- self.loss_scale)
- elif (self._num_steps - self._last_overflow_iter) % self.scale_window == 0:
- self.loss_scale *= self.scale_factor
- self._last_rescale_iter = self._num_steps
- self._num_steps += 1
diff --git a/scripts/bert/index.rst b/scripts/bert/index.rst
deleted file mode 100644
index 158da3be52..0000000000
--- a/scripts/bert/index.rst
+++ /dev/null
@@ -1,369 +0,0 @@
-BERT
-----
-
-:download:`Download scripts `
-
-
-Reference: Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-
-BERT Model Zoo
-~~~~~~~~~~~~~~
-
-The following pre-trained BERT models are available from the **gluonnlp.model.get_model** API:
-
-+-----------------------------------------+----------------+-----------------+
-| | bert_12_768_12 | bert_24_1024_16 |
-+=========================================+================+=================+
-| book_corpus_wiki_en_uncased | ✓ | ✓ |
-+-----------------------------------------+----------------+-----------------+
-| book_corpus_wiki_en_cased | ✓ | ✓ |
-+-----------------------------------------+----------------+-----------------+
-| openwebtext_book_corpus_wiki_en_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_multilingual_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_multilingual_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| wiki_cn_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_scivocab_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_scivocab_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_basevocab_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| scibert_basevocab_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pmc_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pubmed_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.0_pubmed_pmc_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| biobert_v1.1_pubmed_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| clinicalbert_uncased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-| kobert_news_wiki_ko_cased | ✓ | x |
-+-----------------------------------------+----------------+-----------------+
-
-where **bert_12_768_12** refers to the BERT BASE model, and **bert_24_1024_16** refers to the BERT LARGE model.
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('bert_12_768_12', dataset_name='book_corpus_wiki_en_uncased', use_classifier=False, use_decoder=False);
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
- transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);
- sample = transform(['Hello world!']);
- words, valid_len, segments = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]]);
- seq_encoding, cls_encoding = model(words, segments, valid_len);
-
-
-The pretrained parameters for dataset_name
-'openwebtext_book_corpus_wiki_en_uncased' were obtained by running the GluonNLP
-BERT pre-training script on OpenWebText.
-
-The pretrained parameters for dataset_name 'scibert_scivocab_uncased',
-'scibert_scivocab_cased', 'scibert_basevocab_uncased', 'scibert_basevocab_cased'
-were obtained by converting the parameters published by "Beltagy, I., Cohan, A.,
-& Lo, K. (2019). Scibert: Pretrained contextualized embeddings for scientific
-text. arXiv preprint `arXiv:1903.10676 `_."
-
-The pretrained parameters for dataset_name 'biobert_v1.0_pmc',
-'biobert_v1.0_pubmed', 'biobert_v1.0_pubmed_pmc', 'biobert_v1.1_pubmed' were
-obtained by converting the parameters published by "Lee, J., Yoon, W., Kim, S.,
-Kim, D., Kim, S., So, C. H., & Kang, J. (2019). Biobert: pre-trained biomedical
-language representation model for biomedical text mining. arXiv preprint
-`arXiv:1901.08746 `_."
-
-The pretrained parameters for dataset_name 'clinicalbert' were obtained by
-converting the parameters published by "Huang, K., Altosaar, J., & Ranganath, R.
-(2019). ClinicalBERT: Modeling Clinical Notes and Predicting Hospital
-Readmission. arXiv preprint `arXiv:1904.05342
-`_."
-
-Additionally, GluonNLP supports the "`RoBERTa `_" model:
-
-+-----------------------------------------+-------------------+--------------------+
-| | roberta_12_768_12 | roberta_24_1024_16 |
-+=========================================+===================+====================+
-| openwebtext_ccnews_stories_books_cased | ✓ | ✓ |
-+-----------------------------------------+-------------------+--------------------+
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('roberta_12_768_12', dataset_name='openwebtext_ccnews_stories_books_cased', use_decoder=False);
- tokenizer = nlp.data.GPT2BPETokenizer();
- text = [vocab.bos_token] + tokenizer('Hello world!') + [vocab.eos_token];
- seq_encoding = model(mx.nd.array([vocab[text]]))
-
-GluonNLP also supports the "`DistilBERT `_" model:
-
-+-----------------------------------------+----------------------+
-| | distilbert_6_768_12 |
-+=========================================+======================+
-| distil_book_corpus_wiki_en_uncased | ✓ |
-+-----------------------------------------+----------------------+
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('distilbert_6_768_12', dataset_name='distil_book_corpus_wiki_en_uncased');
- tokenizer = nlp.data.BERTTokenizer(vocab, lower=True);
- transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=512, pair=False, pad=False);
- sample = transform(['Hello world!']);
- words, valid_len = mx.nd.array([sample[0]]), mx.nd.array([sample[1]])
- seq_encoding, cls_encoding = model(words, valid_len);
-
-Finally, GluonNLP also supports the Korean BERT pre-trained model, "`KoBERT `_", trained on a Korean wiki dataset (`kobert_news_wiki_ko_cased`).
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx;
- model, vocab = nlp.model.get_model('bert_12_768_12', dataset_name='kobert_news_wiki_ko_cased',use_decoder=False, use_classifier=False)
- tok = nlp.data.get_tokenizer('bert_12_768_12', 'kobert_news_wiki_ko_cased')
- tok('안녕하세요.')
-
-.. hint::
-
- The pre-training, fine-tuning and export scripts are available `here. `__
-
-
-Sentence Classification
-~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example script to fine-tune sentence classification
-models with a pre-trained BERT model.
-
-To enable mixed precision training with float16, set the `--dtype` argument to `float16`.
-
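-For example, the following is an illustrative invocation for the MRPC task with
-mixed precision; other settings such as batch size and learning rate keep the
-defaults defined in the script.
-
-.. code-block:: console
-
-    $ python finetune_classifier.py --task_name MRPC --epochs 3 --optimizer bertadam --dtype float16 --output_dir ./output_dir
-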
-Results using `bert_12_768_12`:
-
-.. editing URL for the following table: https://tinyurl.com/y4n8q84w
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |60.32 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |93.46 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |88.73/91.96 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |90.34 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |84.29/85.07 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| XNLI (Chinese) |Accuracy |78.43 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |74 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-
-
-Results using `roberta_12_768_12`:
-
-.. editing URL for the following table: https://www.shorturl.at/cjAO7
-
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Dataset | SST-2 | MNLI-M/MM |
-+=====================+======================================================================================================+==================================================================================================================+
-| Validation Accuracy | 95.3% | 87.69%, 87.23% |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ |
-+---------------------+------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------+
-
-.. editing URL for the following table: https://tinyurl.com/y5rrowj3
-
-Question Answering on SQuAD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Dataset | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 |
-+===========+=========================================================================================================================================+==========================================================================================================================================+==========================================================================================================================================+
-| Model | bert_12_768_12 | bert_24_1024_16 | bert_24_1024_16 |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| F1 / EM | 88.58 / 81.26 | 90.97 / 84.22 | 81.27 / 78.14 |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ | `log `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ | `command `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-| Prediction| `predictions.json `__ | `predictions.json `__ | `predictions.json `__ |
-+-----------+-----------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------+
-
-For all the model settings above, we set the learning rate to 3e-5 and use the Adam optimizer.
-
-Note that the BERT model is memory-intensive. If you have limited GPU memory, you can emulate a large batch size through gradient accumulation by setting the *accumulate* and *batch_size* arguments accordingly, as in the following command.
-
-.. code-block:: console
-
- $ python finetune_squad.py --optimizer adam --accumulate 2 --batch_size 6 --lr 3e-5 --epochs 2 --gpu
-
-We support multi-GPU training via horovod:
-
-.. code-block:: console
-
- $ HOROVOD_WITH_MXNET=1 HOROVOD_GPU_ALLREDUCE=NCCL pip install horovod --user --no-cache-dir
- $ horovodrun -np 8 python finetune_squad.py --bert_model bert_24_1024_16 --batch_size 4 --lr 3e-5 --epochs 2 --gpu --dtype float16 --comm_backend horovod
-
-SQuAD 2.0
-+++++++++
-
-For SQuAD 2.0, you need to set the *version_2* flag and specify the *null_score_diff_threshold* parameter; typical values are between -1.0 and -5.0. Use the following command to fine-tune the BERT large model on SQuAD 2.0 and generate predictions.json.
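-
-The flag values below are illustrative rather than prescriptive; adjust *batch_size* and *accumulate* to fit your GPU memory, and tune *null_score_diff_threshold* on the dev set:
-
-.. code-block:: console
-
-    $ python finetune_squad.py --bert_model bert_24_1024_16 --optimizer adam --accumulate 8 --batch_size 4 --lr 3e-5 --epochs 2 --gpu --version_2 --null_score_diff_threshold -2.0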
-
-To get the score on the dev data, download the dev dataset (`dev-v2.0.json `_) and the evaluation script (`evaluate-v2.0.py `_). Then use the following command to compute the score on the dev dataset.
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json
-
-BERT INT8 Quantization
-~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example scripts to quantize fine-tuned
-BERT models into the int8 data type. Note that INT8 quantization requires a nightly
-version of `mxnet-mkl `_.
-
-Sentence Classification
-+++++++++++++++++++++++
-
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-| Dataset | Model | FP32 Accuracy | INT8 Accuracy | FP32 F1 | INT8 F1 | Command |
-+===========+===================+===============+===============+=========+=========+========================================================================================================================+
-| MRPC | bert_12_768_12 | 87.01 | 87.01 | 90.97 | 90.88 |`command `__ |
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-| SST-2 | bert_12_768_12 | 93.23 | 93.00 | | |`command `__ |
-+-----------+-------------------+---------------+---------------+---------+---------+------------------------------------------------------------------------------------------------------------------------+
-
-Question Answering
-++++++++++++++++++
-
-+-----------+-------------------+---------+---------+---------+---------+----------------------------------------------------------------------------------------------------------------------------+
-| Dataset | Model | FP32 EM | INT8 EM | FP32 F1 | INT8 F1 | Command |
-+===========+===================+=========+=========+=========+=========+============================================================================================================================+
-| SQuAD 1.1 | bert_12_768_12 | 81.18 | 80.32 | 88.58 | 88.10 |`command `__ |
-+-----------+-------------------+---------+---------+---------+---------+----------------------------------------------------------------------------------------------------------------------------+
-
-For all the model settings above, we use a subset of the evaluation dataset for calibration.
-
-Pre-training from Scratch
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-We also provide scripts for pre-training BERT with masked language modeling and next sentence prediction.
-
-The pre-training data format expects: (1) one sentence per line, ideally actual sentences rather than entire paragraphs or arbitrary spans of text, since the "next sentence prediction" task relies on sentence boundaries; and (2) blank lines between documents. You can find a sample pre-training text with 3 documents `here `__. You can perform sentence segmentation with an off-the-shelf NLP toolkit such as NLTK.
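-
-As a minimal sketch (assuming NLTK and its 'punkt' tokenizer models are installed; the file and variable names are only for illustration), the following converts raw documents into the expected one-sentence-per-line format with blank lines between documents:
-
-.. code-block:: python
-
-    import nltk
-
-    nltk.download('punkt')  # sentence tokenizer models used by sent_tokenize
-
-    documents = [
-        'GluonNLP is a toolkit for natural language processing. It is built on Apache MXNet.',
-        'BERT is pre-trained with masked language modeling. It also uses next sentence prediction.',
-    ]
-
-    with open('pretraining_corpus.txt', 'w') as f:
-        for doc in documents:
-            for sentence in nltk.sent_tokenize(doc):
-                f.write(sentence + '\n')
-            f.write('\n')  # a blank line separates documents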
-
-
-.. hint::
-
-   You can download the pre-processed English Wikipedia dataset `here `__.
-
-
-Prerequisites
-+++++++++++++
-
-We recommend horovod for scalable multi-GPU, multi-machine training.
-
-To install horovod, you need:
-
-- `NCCL `__, and
-- `OpenMPI `__
-
-Then you can install horovod via the following command:
-
-.. code-block:: console
-
- $ HOROVOD_WITH_MXNET=1 HOROVOD_GPU_ALLREDUCE=NCCL pip install horovod==0.16.2 --user --no-cache-dir
-
-Run Pre-training
-++++++++++++++++
-
-You can use the following command to run pre-training with 2 hosts, 8 GPUs each:
-
-.. code-block:: console
-
- $ mpirun -np 16 -H host0_ip:8,host1_ip:8 -mca pml ob1 -mca btl ^openib \
- -mca btl_tcp_if_exclude docker0,lo --map-by ppr:4:socket \
- --mca plm_rsh_agent 'ssh -q -o StrictHostKeyChecking=no' \
- -x NCCL_MIN_NRINGS=8 -x NCCL_DEBUG=INFO -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 \
- -x MXNET_SAFE_ACCUMULATION=1 --tag-output \
- python run_pretraining.py --data='folder1/*.txt,folder2/*.txt,' \
- --data_eval='dev_folder/*.txt,' --num_steps 1000000 \
- --lr 1e-4 --total_batch_size 256 --accumulate 1 --raw --comm_backend horovod
-
-If you see an out-of-memory error, try increasing --accumulate for gradient accumulation.
-
-When multiple hosts are present, please make sure you can ssh to these nodes without a password.
-
-Alternatively, if horovod is not available, you can run pre-training with the MXNet native parameter server by setting --comm_backend and --gpus:
-
-.. code-block:: console
-
- $ MXNET_SAFE_ACCUMULATION=1 python run_pretraining.py --comm_backend device --gpus 0,1,2,3,4,5,6,7 ...
-
-The BERT base model produced by the GluonNLP pre-training script (`log `__), trained on the BooksCorpus and English Wikipedia datasets, achieves 83.6% on MNLI-mm, 93% on SST-2, 87.99% on MRPC, and 80.99/88.60 on the SQuAD 1.1 validation set.
-
-Custom Vocabulary
-+++++++++++++++++
-
-The pre-training script supports subword tokenization with a custom vocabulary using `sentencepiece `__.
-
-To install sentencepiece, run:
-
-.. code-block:: console
-
- $ pip install sentencepiece==0.1.82 --user
-
-You can `train <https://github.com/google/sentencepiece/tree/v0.1.82/python#model-training>`__ a custom sentencepiece vocabulary by specifying the vocabulary size:
-
-.. code-block:: python
-
- import sentencepiece as spm
- spm.SentencePieceTrainer.Train('--input=a.txt,b.txt --unk_id=0 --pad_id=3 --model_prefix=my_vocab --vocab_size=30000 --model_type=BPE')
-
-To use the sentencepiece vocabulary for pre-training, set --sentencepiece=my_vocab.model when running run_pretraining.py, as in the example below.
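-
-For example, a complete invocation with the MXNet native communication backend might look like the following (the data paths and hyperparameter values are illustrative only):
-
-.. code-block:: console
-
-    $ MXNET_SAFE_ACCUMULATION=1 python run_pretraining.py --comm_backend device --gpus 0,1,2,3 \
-          --data='folder1/*.txt,folder2/*.txt,' --data_eval='dev_folder/*.txt,' \
-          --raw --sentencepiece my_vocab.model --num_steps 1000000 \
-          --lr 1e-4 --total_batch_size 256 --accumulate 1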
-
-
-
-Export BERT for Deployment
-~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The current export.py script supports exporting BERT models. Supported values for the --task argument include classification, regression and question answering.
-
-.. code-block:: console
-
- $ python export.py --task classification --model_parameters /path/to/saved/ckpt.params --output_dir /path/to/output/dir/ --seq_length 128
-
-This exports the BERT model for classification to a symbol.json file saved in the directory specified by --output_dir.
-The --model_parameters argument is optional; if it is not set, the .params file saved in the output directory will contain randomly initialized parameters.
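-
-Once exported, the model can be loaded back without GluonNLP for inference. The sketch below is an assumption-laden illustration: it assumes the exporter writes a <prefix>-symbol.json / <prefix>-0000.params pair and that the symbol expects three inputs named data0, data1 and data2 (token ids, token types, valid length); check the actual file and input names written to your --output_dir.
-
-.. code-block:: python
-
-    import mxnet as mx
-
-    # File names and input names below are assumptions; adjust them to the
-    # artifacts actually produced by export.py in --output_dir.
-    net = mx.gluon.SymbolBlock.imports('output_dir/classification-symbol.json',
-                                       ['data0', 'data1', 'data2'],
-                                       'output_dir/classification-0000.params',
-                                       ctx=mx.cpu())
-
-    seq_length = 128
-    token_ids = mx.nd.zeros((1, seq_length))      # padded token ids
-    token_types = mx.nd.zeros((1, seq_length))    # segment ids
-    valid_length = mx.nd.array([seq_length])      # number of non-padding tokens
-    scores = net(token_ids, token_types, valid_length)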
-
-BERT for Sentence or Token Embeddings
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The goal of this BERT embedding script is to obtain token embeddings from a pre-trained BERT model. Instead of building and fine-tuning an end-to-end NLP model, you can build your own model on top of the token embeddings. You can use the command line interface below; a Python sketch of the same idea follows the example output.
-
-.. code-block:: shell
-
- python embedding.py --sentences "GluonNLP is a toolkit that enables easy text preprocessing, datasets loading and neural models building to help you speed up your Natural Language Processing (NLP) research."
- Text: g ##lu ##on ##nl ##p is a tool ##kit that enables easy text prep ##ro ##ces ##sing , data ##set ##s loading and neural models building to help you speed up your natural language processing ( nl ##p ) research .
- Tokens embedding: [array([-0.11881411, -0.59530115, 0.627092 , ..., 0.00648153,
- -0.03886228, 0.03406909], dtype=float32), array([-0.7995638 , -0.6540758 , -0.00521846, ..., -0.42272145,
- -0.5787281 , 0.7021201 ], dtype=float32), array([-0.7406778 , -0.80276626, 0.3931962 , ..., -0.49068323,
- -0.58128357, 0.6811132 ], dtype=float32), array([-0.43287313, -1.0018158 , 0.79617643, ..., -0.26877284,
- -0.621779 , -0.2731115 ], dtype=float32), array([-0.8515188 , -0.74098676, 0.4427735 , ..., -0.41267148,
- -0.64225197, 0.3949393 ], dtype=float32), array([-0.86652845, -0.27746758, 0.8806506 , ..., -0.87452525,
- -0.9551989 , -0.0786318 ], dtype=float32), array([-1.0987284 , -0.36603633, 0.2826037 , ..., -0.33794224,
- -0.55210876, -0.09221527], dtype=float32), array([-0.3483025 , 0.401534 , 0.9361341 , ..., -0.29747447,
- -0.49559578, -0.08878893], dtype=float32), array([-0.65626 , -0.14857645, 0.29733548, ..., -0.15890433,
- -0.45487815, -0.28494897], dtype=float32), array([-0.1983894 , 0.67196256, 0.7867421 , ..., -0.7990434 ,
- 0.05860569, -0.26884627], dtype=float32), array([-0.3775159 , -0.00590206, 0.5240432 , ..., -0.26754653,
- -0.37806216, 0.23336883], dtype=float32), array([ 0.1876977 , 0.30165672, 0.47167772, ..., -0.43823618,
- -0.42823148, -0.48873612], dtype=float32), array([-0.6576557 , -0.09822252, 0.1121515 , ..., -0.21743725,
- -0.1820574 , -0.16115054], dtype=float32)]
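-
-If you prefer a programmatic interface, the following is a minimal Python sketch of the same idea using the GluonNLP model zoo API directly; it is not the embedding.py implementation itself, and the tokenization settings shown assume the uncased base model:
-
-.. code-block:: python
-
-    import mxnet as mx
-    import gluonnlp as nlp
-
-    # Load a pre-trained BERT base encoder without the pooler/decoder/classifier heads.
-    model, vocab = nlp.model.get_model('bert_12_768_12',
-                                       dataset_name='book_corpus_wiki_en_uncased',
-                                       pretrained=True, use_pooler=False,
-                                       use_decoder=False, use_classifier=False)
-    tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
-
-    sentence = 'GluonNLP is a toolkit that enables easy text preprocessing.'
-    tokens = [vocab.cls_token] + tokenizer(sentence) + [vocab.sep_token]
-    token_ids = mx.nd.array([vocab.to_indices(tokens)])
-    token_types = mx.nd.zeros_like(token_ids)
-    valid_length = mx.nd.array([len(tokens)])
-
-    # Shape (1, num_tokens, 768): one embedding vector per (sub)word token.
-    token_embeddings = model(token_ids, token_types, valid_length)
-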
diff --git a/scripts/bert/model/__init__.py b/scripts/bert/model/__init__.py
deleted file mode 100644
index e1aae8e5ab..0000000000
--- a/scripts/bert/model/__init__.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""BERT model."""
-from . import qa
diff --git a/scripts/bert/model/qa.py b/scripts/bert/model/qa.py
deleted file mode 100644
index 39418bd54b..0000000000
--- a/scripts/bert/model/qa.py
+++ /dev/null
@@ -1,112 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""BertForQA models."""
-
-__all__ = ['BertForQA', 'BertForQALoss']
-
-from mxnet.gluon import HybridBlock, loss, nn
-from mxnet.gluon.loss import Loss
-
-
-class BertForQA(HybridBlock):
- """Model for SQuAD task with BERT.
-
-    The model feeds token ids and token type ids into BERT to get the encoded
-    sequence representation, then applies a Dense layer to predict the answer span.
-
- Parameters
- ----------
- bert: BERTModel
- Bidirectional encoder with transformer.
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert, prefix=None, params=None):
- super(BertForQA, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- with self.name_scope():
- self.span_classifier = nn.Dense(units=2, flatten=False)
-
- def __call__(self, inputs, token_types, valid_length=None):
- #pylint: disable=arguments-differ, dangerous-default-value
- """Generate the unnormalized score for the given the input sequences."""
- # XXX Temporary hack for hybridization as hybridblock does not support None inputs
- valid_length = [] if valid_length is None else valid_length
- return super(BertForQA, self).__call__(inputs, token_types, valid_length)
-
- def hybrid_forward(self, F, inputs, token_types, valid_length=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size,)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size, seq_length, 2)
- """
- # XXX Temporary hack for hybridization as hybridblock does not support None inputs
- if isinstance(valid_length, list) and len(valid_length) == 0:
- valid_length = None
- bert_output = self.bert(inputs, token_types, valid_length)
- output = self.span_classifier(bert_output)
- return output
-
-
-class BertForQALoss(Loss):
- """Loss for SQuAD task with BERT.
-
- """
-
- def __init__(self, weight=None, batch_axis=0, **kwargs): # pylint: disable=unused-argument
- super(BertForQALoss, self).__init__(
- weight=None, batch_axis=0, **kwargs)
- self.loss = loss.SoftmaxCELoss()
-
- def hybrid_forward(self, F, pred, label): # pylint: disable=arguments-differ
- """
- Parameters
- ----------
- pred : NDArray, shape (batch_size, seq_length, 2)
- BERTSquad forward output.
- label : list, length is 2, each shape is (batch_size,1)
- label[0] is the starting position of the answer,
- label[1] is the ending position of the answer.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size,)
- """
- pred = F.split(pred, axis=2, num_outputs=2)
- start_pred = pred[0].reshape((0, -3))
- start_label = label[0]
- end_pred = pred[1].reshape((0, -3))
- end_label = label[1]
- return (self.loss(start_pred, start_label) + self.loss(
- end_pred, end_label)) / 2
diff --git a/scripts/bert/pretraining_utils.py b/scripts/bert/pretraining_utils.py
deleted file mode 100644
index 876703240c..0000000000
--- a/scripts/bert/pretraining_utils.py
+++ /dev/null
@@ -1,526 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Utilities for pre-training."""
-import time
-import os
-import sys
-import logging
-import random
-import multiprocessing
-
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-from data.create_pretraining_data import create_training_instances
-
-
-__all__ = ['get_model_loss', 'get_pretrain_data_npz', 'get_dummy_dataloader',
- 'save_parameters', 'save_states', 'evaluate', 'split_and_load',
- 'get_pretrain_data_text', 'generate_dev_set', 'profile']
-
-def get_model_loss(ctx, model, pretrained, dataset_name, vocab, dtype,
- ckpt_dir=None, start_step=None):
- """Get model for pre-training.
-
- Parameters
- ----------
- ctx : Context or list of Context
- Contexts to initialize model
- model : str
- The name of the model, 'bert_12_768_12' or 'bert_24_1024_16'.
- pretrained : bool
- Whether to use pre-trained model weights as initialization.
- dataset_name : str
- The name of the dataset, which is used to retrieve the corresponding vocabulary file
- when the vocab argument is not provided. Options include 'book_corpus_wiki_en_uncased',
- 'book_corpus_wiki_en_cased', 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'.
- vocab : BERTVocab or None
-        The vocabulary for the model. If not provided, the vocabulary will be constructed
- based on dataset_name.
- dtype : float
- Data type of the model for training.
- ckpt_dir : str
- The path to the checkpoint directory.
- start_step : int or None
- If provided, it loads the model from the corresponding checkpoint from the ckpt_dir.
-
- Returns
- -------
- BERTForPretrain : the model for pre-training.
- BERTVocab : the vocabulary.
- """
- # model
- model, vocabulary = nlp.model.get_model(model, dataset_name=dataset_name, vocab=vocab,
- pretrained=pretrained, ctx=ctx,
- hparam_allow_override=True)
-
- if not pretrained:
- model.initialize(init=mx.init.Normal(0.02), ctx=ctx)
- model.cast(dtype)
-
- if ckpt_dir and start_step:
- param_path = os.path.join(ckpt_dir, '%07d.params'%start_step)
- nlp.utils.load_parameters(model, param_path, ctx=ctx, cast_dtype=True)
- logging.info('Loading step %d checkpoints from %s.', start_step, param_path)
-
- model.hybridize(static_alloc=True, static_shape=True)
-
- # losses
- nsp_loss = mx.gluon.loss.SoftmaxCELoss()
- mlm_loss = mx.gluon.loss.SoftmaxCELoss()
- nsp_loss.hybridize(static_alloc=True, static_shape=True)
- mlm_loss.hybridize(static_alloc=True, static_shape=True)
-
- model = BERTForPretrain(model, nsp_loss, mlm_loss, len(vocabulary))
- return model, vocabulary
-
-
-def prepare_pretrain_npz_dataset(filename, allow_pickle=False):
- """Create dataset based on the numpy npz file"""
- if isinstance(filename, (list, tuple)):
- assert len(filename) == 1, \
- 'When .npy/.npz data file is loaded, len(filename) must be 1.' \
- ' Received len(filename)={}.'.format(len(filename))
- filename = filename[0]
- logging.debug('start to load file %s ...', filename)
- return nlp.data.NumpyDataset(filename, allow_pickle=allow_pickle)
-
-
-def prepare_pretrain_text_dataset(filename, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- vocab, num_workers=1, worker_pool=None):
- """Create dataset based on the raw text files"""
- dupe_factor = 1
- if not isinstance(filename, (list, tuple)):
- filename = [filename]
- logging.debug('start to load files %s ...', filename)
- instances = create_training_instances((filename, tokenizer, max_seq_length,
- short_seq_prob, masked_lm_prob,
- max_predictions_per_seq,
- whole_word_mask, vocab,
- dupe_factor, num_workers,
- worker_pool, None))
- return mx.gluon.data.ArrayDataset(*instances)
-
-
-def prepare_pretrain_bucket_sampler(dataset, batch_size, shuffle=False,
- num_ctxes=1, num_buckets=1):
- """Create data sampler based on the dataset"""
- if isinstance(dataset, nlp.data.NumpyDataset):
- lengths = dataset.get_field('valid_lengths')
- else:
- lengths = dataset.transform(lambda input_ids, segment_ids, masked_lm_positions, \
- masked_lm_ids, masked_lm_weights, \
- next_sentence_labels, valid_lengths: \
- valid_lengths, lazy=False)
- # calculate total batch size for all GPUs
- batch_size = batch_size * num_ctxes
- sampler = nlp.data.FixedBucketSampler(lengths,
- batch_size=batch_size,
- num_buckets=num_buckets,
- ratio=0,
- shuffle=shuffle)
- logging.debug('Sampler created for a new dataset:\n%s', sampler.stats())
- return sampler
-
-
-def get_pretrain_data_text(data, batch_size, num_ctxes, shuffle,
- num_buckets, vocab, tokenizer, max_seq_length, short_seq_prob,
- masked_lm_prob, max_predictions_per_seq, whole_word_mask,
- num_parts=1, part_idx=0, num_dataset_workers=1, num_batch_workers=1,
- circle_length=1, repeat=1,
- dataset_cached=False, num_max_dataset_cached=0):
- """Get a data iterator from raw text documents.
-
- Parameters
- ----------
- batch_size : int
- The batch size per GPU.
- num_ctxes : int
- The number of GPUs.
- shuffle : bool
- Whether to shuffle the data.
- num_buckets : int
- The number of buckets for the FixedBucketSampler for training.
- vocab : BERTVocab
- The vocabulary.
- tokenizer : BERTTokenizer or BERTSPTokenizer
- The tokenizer.
- max_seq_length : int
- The hard limit of maximum sequence length of sentence pairs.
- short_seq_prob : float
- The probability of sampling sequences shorter than the max_seq_length.
- masked_lm_prob : float
- The probability of replacing texts with masks/random words/original words.
- max_predictions_per_seq : int
- The hard limit of the number of predictions for masked words
- whole_word_mask : bool
- Whether to use whole word masking.
- num_parts : int
- The number of partitions for the dataset.
- part_idx : int
- The index of the partition to read.
- num_dataset_workers : int
- The number of worker processes for dataset construction.
- num_batch_workers : int
- The number of worker processes for batch construction.
- circle_length : int, default is 1
- The number of files to be read for a single worker at the same time.
- When circle_length is larger than 1, we merge circle_length files.
- repeat : int, default is 1
- The number of times that files are repeated.
- dataset_cached : bool, default is False
-        Whether or not to cache the last processed dataset.
-        Each processed dataset can only be cached once.
-        When there is no newly processed dataset available to fetch,
-        we pop a cached processed dataset.
- num_max_dataset_cached : int, default is 0
- Maximum number of cached datasets. It is valid only if dataset_cached is True
- """
- num_files = len(nlp.utils.glob(data))
- logging.info('%d files are found.', num_files)
- assert num_files >= num_parts, \
- 'The number of text files must be no less than the number of ' \
- 'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
- dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
- 'short_seq_prob': short_seq_prob, 'masked_lm_prob': masked_lm_prob,
- 'max_predictions_per_seq': max_predictions_per_seq, 'vocab':vocab,
- 'whole_word_mask': whole_word_mask}
- sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
- 'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
- dataset_fn = prepare_pretrain_text_dataset
- sampler_fn = prepare_pretrain_bucket_sampler
- pad_val = vocab[vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=pad_val, round_to=8), # input_id
- nlp.data.batchify.Pad(pad_val=pad_val), # masked_id
- nlp.data.batchify.Pad(pad_val=0), # masked_position
- nlp.data.batchify.Pad(pad_val=0), # masked_weight
- nlp.data.batchify.Stack(), # next_sentence_label
- nlp.data.batchify.Pad(pad_val=0, round_to=8), # segment_id
- nlp.data.batchify.Stack())
- split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
- part_index=part_idx, repeat=repeat)
- dataloader = nlp.data.DatasetLoader(data,
- file_sampler=split_sampler,
- dataset_fn=dataset_fn,
- batch_sampler_fn=sampler_fn,
- dataset_params=dataset_params,
- batch_sampler_params=sampler_params,
- batchify_fn=batchify_fn,
- num_dataset_workers=num_dataset_workers,
- num_batch_workers=num_batch_workers,
- pin_memory=False,
- circle_length=circle_length,
- dataset_cached=dataset_cached,
- num_max_dataset_cached=num_max_dataset_cached)
- return dataloader
-
-
-def get_pretrain_data_npz(data, batch_size, num_ctxes,
- shuffle, num_buckets,
- vocab, num_parts=1, part_idx=0,
- num_dataset_workers=1, num_batch_workers=1,
- circle_length=1, repeat=1,
- dataset_cached=False, num_max_dataset_cached=0):
- """Get a data iterator from pre-processed npz files.
-
- Parameters
- ----------
- batch_size : int
- The batch size per GPU.
- num_ctxes : int
- The number of GPUs.
- shuffle : bool
- Whether to shuffle the data.
- num_buckets : int
- The number of buckets for the FixedBucketSampler for training.
- vocab : BERTVocab
- The vocabulary.
- num_parts : int
- The number of partitions for the dataset.
- part_idx : int
- The index of the partition to read.
- num_dataset_workers : int
- The number of worker processes for dataset construction.
- num_batch_workers : int
-        The number of worker processes for batch construction.
- circle_length : int, default is 1
- The number of files to be read for a single worker at the same time.
- When circle_length is larger than 1, we merge circle_length files.
- repeat : int, default is 1
- The number of times that files are repeated.
- dataset_cached : bool, default is False
-        Whether or not to cache the last processed dataset.
-        Each processed dataset can only be cached once.
-        When there is no newly processed dataset available to fetch,
-        we pop a cached processed dataset.
- num_max_dataset_cached : int, default is 0
- Maximum number of cached datasets. It is valid only if dataset_cached is True
- """
- num_files = len(nlp.utils.glob(data))
- logging.info('%d files are found.', num_files)
- assert num_files >= num_parts, \
- 'The number of text files must be no less than the number of ' \
- 'workers/partitions (%d). Only %d files at %s are found.'%(num_parts, num_files, data)
- dataset_params = {'allow_pickle': True}
- sampler_params = {'batch_size': batch_size, 'shuffle': shuffle,
- 'num_ctxes': num_ctxes, 'num_buckets': num_buckets}
- dataset_fn = prepare_pretrain_npz_dataset
- sampler_fn = prepare_pretrain_bucket_sampler
- pad_val = vocab[vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(pad_val=pad_val, round_to=8), # input_id
- nlp.data.batchify.Pad(pad_val=pad_val), # masked_id
- nlp.data.batchify.Pad(pad_val=0), # masked_position
- nlp.data.batchify.Pad(pad_val=0), # masked_weight
- nlp.data.batchify.Stack(), # next_sentence_label
- nlp.data.batchify.Pad(pad_val=0, round_to=8), # segment_id
- nlp.data.batchify.Stack())
- split_sampler = nlp.data.SplitSampler(num_files, num_parts=num_parts,
- part_index=part_idx, repeat=repeat)
- dataloader = nlp.data.DatasetLoader(data,
- file_sampler=split_sampler,
- dataset_fn=dataset_fn,
- batch_sampler_fn=sampler_fn,
- dataset_params=dataset_params,
- batch_sampler_params=sampler_params,
- batchify_fn=batchify_fn,
- num_dataset_workers=num_dataset_workers,
- num_batch_workers=num_batch_workers,
- pin_memory=False,
- circle_length=circle_length,
- dataset_cached=dataset_cached,
- num_max_dataset_cached=num_max_dataset_cached)
- return dataloader
-
-
-def get_dummy_dataloader(batch_size, seq_len, max_predict):
- """Return a dummy data loader which returns a fixed data batch of target shape"""
- class DummyIter():
- def __init__(self, batch):
- self._batch = batch
-
- def __iter__(self):
- while True:
- yield self._batch
- data_batch = ((mx.nd.zeros((batch_size, seq_len)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.zeros((batch_size, max_predict)),
- mx.nd.ones((batch_size,)) * seq_len,
- mx.nd.zeros((batch_size, seq_len)),
- mx.nd.ones((batch_size,)) * seq_len))
- return DummyIter(data_batch)
-
-
-def save_parameters(step_num, model, ckpt_dir):
- """Save the model parameter, marked by step_num."""
- param_path = os.path.join(ckpt_dir, '%07d.params'%step_num)
- logging.info('[step %d] Saving model params to %s.', step_num, param_path)
- nlp.utils.save_parameters(model, param_path)
-
-def save_states(step_num, trainer, ckpt_dir, local_rank=0):
- """Save the trainer states, marked by step_num."""
- trainer_path = os.path.join(ckpt_dir, '%07d.states.%02d'%(step_num, local_rank))
- logging.info('[step %d] Saving trainer states to %s.', step_num, trainer_path)
- nlp.utils.save_states(trainer, trainer_path)
-
-def log_noacc(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss, step_num,
- trainer, log_interval):
- """Log training progress."""
- end_time = time.time()
- duration = end_time - begin_time
- throughput = running_num_tks / duration / 1000.0
- running_mlm_loss = running_mlm_loss / log_interval
- running_nsp_loss = running_nsp_loss / log_interval
- lr = trainer.learning_rate if trainer else 0
- # pylint: disable=line-too-long
- logging.info('[step {}]\tmlm_loss={:7.5f}\tnsp_loss={:5.2f}\tthroughput={:.1f}K tks/s\tlr={:.7f} time={:.2f}, latency={:.1f} ms/step'
- .format(step_num, running_mlm_loss.asscalar(), running_nsp_loss.asscalar(),
- throughput.asscalar(), lr, duration, duration*1000/log_interval))
- # pylint: enable=line-too-long
-
-def log(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss, step_num,
- mlm_metric, nsp_metric, trainer, log_interval):
- """Log training progress."""
- end_time = time.time()
- duration = end_time - begin_time
- throughput = running_num_tks / duration / 1000.0
- running_mlm_loss = running_mlm_loss / log_interval
- running_nsp_loss = running_nsp_loss / log_interval
- lr = trainer.learning_rate if trainer else 0
- # pylint: disable=line-too-long
- logging.info('[step {}]\tmlm_loss={:7.5f}\tmlm_acc={:4.2f}\tnsp_loss={:5.2f}\tnsp_acc={:5.2f}\tthroughput={:.1f}K tks/s\tlr={:.7f} time={:.2f}, latency={:.1f} ms/step'
- .format(step_num, running_mlm_loss.asscalar(), mlm_metric.get()[1] * 100, running_nsp_loss.asscalar(),
- nsp_metric.get()[1] * 100, throughput.asscalar(), lr, duration, duration*1000/log_interval))
- # pylint: enable=line-too-long
-
-
-def split_and_load(arrs, ctx):
- """split and load arrays to a list of contexts"""
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [mx.gluon.utils.split_and_load(arr, ctx, even_split=False) for arr in arrs]
- return zip(*loaded_arrs)
-
-
-class BERTForPretrain(mx.gluon.Block):
- """Model for pre-training MLM and NSP with BERT.
-
- Parameters
- ----------
- bert: BERTModel
- Bidirectional encoder with transformer.
- mlm_loss : Loss or None
- nsp_loss : Loss or None
- vocab_size : int
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert, mlm_loss, nsp_loss, vocab_size, prefix=None, params=None):
- super(BERTForPretrain, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- self.mlm_loss = mlm_loss
- self.nsp_loss = nsp_loss
- self._vocab_size = vocab_size
-
- def forward(self, input_id, masked_id, masked_position, masked_weight,
- next_sentence_label=None, segment_id=None, valid_length=None):
- # pylint: disable=arguments-differ
- """Predict with BERT for MLM and NSP. """
- num_masks = masked_weight.sum() + 1e-8
- valid_length = valid_length.reshape(-1)
- masked_id = masked_id.reshape(-1)
- _, _, classified, decoded = self.bert(input_id, segment_id, valid_length, masked_position)
- decoded = decoded.reshape((-1, self._vocab_size))
- ls1 = self.mlm_loss(decoded.astype('float32', copy=False),
- masked_id, masked_weight.reshape((-1, 1)))
- ls2 = self.nsp_loss(classified.astype('float32', copy=False), next_sentence_label)
- ls1 = ls1.sum() / num_masks
- ls2 = ls2.mean()
- return classified, decoded, ls1, ls2
-
-
-def evaluate(data_eval, model, ctx, log_interval, dtype):
- """Evaluation function."""
- logging.info('Running evaluation ... ')
- mlm_metric = nlp.metric.MaskedAccuracy()
- nsp_metric = nlp.metric.MaskedAccuracy()
- mlm_metric.reset()
- nsp_metric.reset()
-
- eval_begin_time = time.time()
- begin_time = time.time()
- step_num = 0
- running_mlm_loss = running_nsp_loss = 0
- total_mlm_loss = total_nsp_loss = 0
- running_num_tks = 0
- for _, data_batch in enumerate(data_eval):
- step_num += 1
-
- data_list = split_and_load(data_batch, ctx)
- ns_label_list, ns_pred_list = [], []
- mask_label_list, mask_pred_list, mask_weight_list = [], [], []
- for data in data_list:
- (input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length) = data
- valid_length = valid_length.astype(dtype, copy=False)
- out = model(input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length)
- classified, decoded, ls1, ls2 = out
- masked_id = masked_id.reshape(-1)
- ns_label_list.append(next_sentence_label)
- ns_pred_list.append(classified)
- mask_label_list.append(masked_id)
- mask_pred_list.append(decoded)
- mask_weight_list.append(masked_weight)
-
- valid_length = valid_length.astype('float32', copy=False)
- running_mlm_loss += ls1.as_in_context(mx.cpu())
- running_nsp_loss += ls2.as_in_context(mx.cpu())
- running_num_tks += valid_length.sum().as_in_context(mx.cpu())
- nsp_metric.update(ns_label_list, ns_pred_list)
- mlm_metric.update(mask_label_list, mask_pred_list, mask_weight_list)
-
- # logging
- if (step_num + 1) % (log_interval) == 0:
- total_mlm_loss += running_mlm_loss
- total_nsp_loss += running_nsp_loss
- log(begin_time, running_num_tks, running_mlm_loss, running_nsp_loss,
- step_num, mlm_metric, nsp_metric, None, log_interval)
- begin_time = time.time()
- running_mlm_loss = running_nsp_loss = running_num_tks = 0
- mlm_metric.reset_local()
- nsp_metric.reset_local()
-
- mx.nd.waitall()
- eval_end_time = time.time()
- # accumulate losses from last few batches, too
- if running_mlm_loss != 0:
- total_mlm_loss += running_mlm_loss
- total_nsp_loss += running_nsp_loss
- total_mlm_loss /= step_num
- total_nsp_loss /= step_num
- logging.info('Eval mlm_loss={:.3f}\tmlm_acc={:.1f}\tnsp_loss={:.3f}\tnsp_acc={:.1f}\t'
- .format(total_mlm_loss.asscalar(), mlm_metric.get_global()[1] * 100,
- total_nsp_loss.asscalar(), nsp_metric.get_global()[1] * 100))
- logging.info('Eval cost={:.1f}s'.format(eval_end_time - eval_begin_time))
-
-
-def generate_dev_set(tokenizer, vocab, cache_file, args):
- """Generate validation set."""
- # set random seed to generate dev data deterministically
- np.random.seed(0)
- random.seed(0)
- mx.random.seed(0)
- worker_pool = multiprocessing.Pool()
- eval_files = nlp.utils.glob(args.data_eval)
- num_files = len(eval_files)
-    assert num_files > 0, 'Number of eval files must be greater than 0. ' \
- 'Only found %d files at %s'%(num_files, args.data_eval)
- logging.info('Generating validation set from %d files on rank 0.', len(eval_files))
- create_training_instances((eval_files, tokenizer, args.max_seq_length,
- args.short_seq_prob, args.masked_lm_prob,
- args.max_predictions_per_seq,
- args.whole_word_mask, vocab,
- 1, args.num_dataset_workers,
- worker_pool, cache_file))
- logging.info('Done generating validation set on rank 0.')
-
-def profile(curr_step, start_step, end_step, profile_name='profile.json',
- early_exit=True):
- """profile the program between [start_step, end_step)."""
- if curr_step == start_step:
- mx.nd.waitall()
- mx.profiler.set_config(profile_memory=False, profile_symbolic=True,
- profile_imperative=True, filename=profile_name,
- aggregate_stats=True)
- mx.profiler.set_state('run')
- elif curr_step == end_step:
- mx.nd.waitall()
- mx.profiler.set_state('stop')
- logging.info(mx.profiler.dumps())
- mx.profiler.dump()
- if early_exit:
- sys.exit(0)
diff --git a/scripts/bert/run_pretraining.py b/scripts/bert/run_pretraining.py
deleted file mode 100644
index 8a5b4bb295..0000000000
--- a/scripts/bert/run_pretraining.py
+++ /dev/null
@@ -1,479 +0,0 @@
-"""
-Pre-training Bidirectional Encoder Representations from Transformers
-=========================================================================================
-This example shows how to pre-train a BERT model with Gluon NLP Toolkit.
-@article{devlin2018bert,
- title={BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
- author={Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
- journal={arXiv preprint arXiv:1810.04805},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import os
-import sys
-import random
-import warnings
-import logging
-import functools
-import time
-import argparse
-
-import mxnet as mx
-import gluonnlp as nlp
-try:
- import horovod.mxnet as hvd
-except ImportError:
- pass
-
-from fp16_utils import FP16Trainer
-from pretraining_utils import get_model_loss, get_pretrain_data_npz, get_dummy_dataloader
-from pretraining_utils import split_and_load, log, log_noacc, evaluate
-from pretraining_utils import save_parameters, save_states, profile
-from pretraining_utils import get_pretrain_data_text, generate_dev_set
-
-# parser
-parser = argparse.ArgumentParser(description='BERT pretraining example.')
-# logging and serialization
-parser.add_argument('--ckpt_dir', type=str, default='./ckpt_dir',
- help='Path to checkpoint directory')
-parser.add_argument('--log_interval', type=int, default=250, help='Report interval')
-parser.add_argument('--ckpt_interval', type=int, default=25000, help='Checkpoint interval')
-# model
-parser.add_argument('--pretrained', action='store_true',
- help='Initialize the model with pretrained weights')
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='Model to pre-train.')
-parser.add_argument('--dataset_name', type=str, default='book_corpus_wiki_en_uncased',
- choices=['book_corpus_wiki_en_uncased', 'book_corpus_wiki_en_cased',
- 'wiki_multilingual_uncased', 'wiki_multilingual_cased',
- 'wiki_cn_cased'],
- help='The pre-defined dataset from which the vocabulary is created.')
-# training
-parser.add_argument('--data', type=str, default=None,
- help='Path to training data file. File name with wildcard such as '
- 'dir/*.train is accepted.')
-parser.add_argument('--total_batch_size', type=int, default=256,
- help='Global effective batch size. '
- 'total_batch_size = batch_size_per_worker * num_worker * accumulate.')
-parser.add_argument('--accumulate', type=int, default=1,
- help='Number of batches for gradient accumulation. '
- 'total_batch_size = batch_size_per_worker * num_worker * accumulate.')
-parser.add_argument('--num_steps', type=int, default=20, help='Number of optimization steps')
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='The optimization algorithm')
-parser.add_argument('--start_step', type=int, default=0,
- help='Start optimization step from the checkpoint.')
-parser.add_argument('--lr', type=float, default=1e-4, help='Learning rate')
-parser.add_argument('--warmup_ratio', type=float, default=0.01,
- help='ratio of warmup steps used in NOAM\'s stepsize schedule')
-parser.add_argument('--dtype', type=str, default='float16', help='data dtype')
-parser.add_argument('--no_compute_acc', action='store_true',
- help='skip accuracy metric computation during training')
-# validation
-parser.add_argument('--eval_interval', type=int, default=50000, help='Evaluation interval')
-parser.add_argument('--total_batch_size_eval', type=int, default=256,
- help='Global batch size for evaluation. total_batch_size_eval = '
- 'batch_size_eval_per_worker * num_worker * accumulate.')
-parser.add_argument('--data_eval', type=str, required=True,
- help='Path to evaluation data file. File name with wildcard such as '
- 'dir/*.dev is accepted.')
-parser.add_argument('--eval_use_npz', action='store_true',
- help='Set to True if --data_eval provides npz files instead of raw text files')
-# debugging
-parser.add_argument('--synthetic_data', action='store_true',
- help='If provided, synthetic data is used for training')
-parser.add_argument('--verbose', action='store_true', help='verbose logging')
-parser.add_argument('--profile', type=str, default=None,
- help='output profiling result to the provided file path')
-# data pre-processing
-parser.add_argument('--num_buckets', type=int, default=1,
- help='Number of buckets for variable length sequence sampling')
-parser.add_argument('--raw', action='store_true',
- help='If set, both training and dev samples are generated on-the-fly '
- 'from raw texts instead of pre-processed npz files. ')
-parser.add_argument('--max_seq_length', type=int, default=512,
- help='Maximum input sequence length. Effective only if --raw is set.')
-parser.add_argument('--short_seq_prob', type=float, default=0,
- help='The probability of producing sequences shorter than max_seq_length. '
- 'Effective only if --raw is set.')
-parser.add_argument('--masked_lm_prob', type=float, default=0.15,
- help='Probability for masks. Effective only if --raw is set.')
-parser.add_argument('--max_predictions_per_seq', type=int, default=80,
- help='Maximum number of predictions per sequence. '
- 'Effective only if --raw is set.')
-parser.add_argument('--cased', action='store_true',
- help='Whether to tokenize with cased characters. '
- 'Effective only if --raw is set.')
-parser.add_argument('--whole_word_mask', action='store_true',
- help='Whether to use whole word masking rather than per-subword masking.'
- 'Effective only if --raw is set.')
-parser.add_argument('--sentencepiece', default=None, type=str,
- help='Path to the sentencepiece .model file for both tokenization and vocab. '
- 'Effective only if --raw is set.')
-parser.add_argument('--num_dataset_workers', type=int, default=4,
- help='Number of workers to pre-process dataset.')
-parser.add_argument('--num_batch_workers', type=int, default=2,
- help='Number of workers to pre-process mini-batch.')
-parser.add_argument('--circle_length', type=int, default=2,
- help='Number of files to be read for a single GPU at the same time.')
-parser.add_argument('--repeat', type=int, default=8,
- help='Number of times that files are repeated in each shuffle.')
-parser.add_argument('--dataset_cached', action='store_true',
- help='Whether or not to cache the last processed training dataset.')
-parser.add_argument('--num_max_dataset_cached', type=int, default=0,
- help='Maximum number of cached processed training dataset.')
-# stage 2
-parser.add_argument('--phase2', action='store_true', help='phase 2 training')
-parser.add_argument('--phase1_num_steps', type=int, help='number of steps for phase 1')
-# communication
-parser.add_argument('--comm_backend', type=str, default='device',
- choices=['horovod', 'dist_sync_device', 'device'],
- help='Communication backend.')
-parser.add_argument('--gpus', type=str, default=None,
- help='List of gpus to run when device or dist_sync_device is used for '
- 'communication, e.g. 0 or 0,2,5. empty means using cpu.')
-args = parser.parse_args()
-
-# logging
-nlp.utils.mkdir(args.ckpt_dir)
-level = logging.DEBUG if args.verbose else logging.INFO
-os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
-
-class DataParallelBERT(nlp.utils.Parallelizable):
- """Data parallel BERT model.
-
- Parameters
- ----------
- model : Block
- The BERT model.
- """
- def __init__(self, model, trainer):
- self._model = model
- self._trainer = trainer
-
- def forward_backward(self, x):
- """forward backward implementation"""
- (input_id, masked_id, masked_position, masked_weight, \
- next_sentence_label, segment_id, valid_length) = x
-
- valid_length = valid_length.astype(args.dtype, copy=False)
- with mx.autograd.record():
- out = self._model(input_id, masked_id, masked_position, masked_weight,
- next_sentence_label, segment_id, valid_length)
- classified, decoded, ls1, ls2 = out
- ls = ls1 + ls2
- ls = ls / args.accumulate
- if self._trainer:
- self._trainer.backward(ls)
- else:
- ls.backward()
-
- masked_id = masked_id.reshape(-1)
- valid_length = valid_length.astype('float32', copy=False)
- return next_sentence_label, classified, masked_id, decoded, \
- masked_weight, ls1, ls2, valid_length
-
-def init_comm(backend):
- """Init communication backend"""
- # backend specific implementation
- if backend == 'horovod':
- try:
- import horovod.mxnet as hvd # pylint: disable=import-outside-toplevel
- except ImportError:
- logging.info('horovod must be installed.')
- sys.exit(1)
- hvd.init()
- store = None
- num_workers = hvd.size()
- rank = hvd.rank()
- local_rank = hvd.local_rank()
- is_master_node = rank == local_rank
- ctxs = [mx.gpu(local_rank)]
- else:
- # kvstore
- store = mx.kv.create(backend)
- num_workers = store.num_workers
- rank = store.rank
- local_rank = 0
- is_master_node = rank == local_rank
- ctxs = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
- return store, num_workers, rank, local_rank, is_master_node, ctxs
-
-backend = args.comm_backend
-store, num_workers, rank, local_rank, is_master_node, ctxs = init_comm(backend)
-
-filename = os.path.join(args.ckpt_dir,
- ('phase1_log.' if not args.phase2 else 'phase2_log.') + str(rank))
-logging.basicConfig(filename=filename)
-logging.getLogger().setLevel(level)
-logging.info(args)
-logging.info(os.environ)
-
-assert args.total_batch_size % (args.accumulate * num_workers) == 0
-assert args.total_batch_size_eval % (args.accumulate * num_workers) == 0
-batch_size = int(args.total_batch_size / num_workers / args.accumulate)
-batch_size_eval = int(args.total_batch_size_eval / num_workers / args.accumulate)
-assert batch_size > 0
-assert batch_size_eval > 0
-
-def train(data_train, data_eval, model):
- """Training function."""
- # backend specific implementation
- param_dict = model.bert.collect_params()
- if backend == 'horovod':
- hvd.broadcast_parameters(param_dict, root_rank=0)
-
- mlm_metric = nlp.metric.MaskedAccuracy()
- nsp_metric = nlp.metric.MaskedAccuracy()
- mlm_metric.reset()
- nsp_metric.reset()
-
- logging.info('Creating distributed trainer...')
- lr = args.lr
- optim_params = {'learning_rate': lr, 'epsilon': 1e-6, 'wd': 0.01}
- if args.dtype == 'float16':
- optim_params['multi_precision'] = True
-
- dynamic_loss_scale = args.dtype == 'float16'
- if dynamic_loss_scale:
- loss_scale_param = {'scale_window': 2000 / num_workers, 'init_scale': 2**10}
- else:
- loss_scale_param = None
-
- # backend specific implementation
- if backend == 'horovod':
- trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optim_params)
- else:
- trainer = mx.gluon.Trainer(param_dict, args.optimizer, optim_params,
- update_on_kvstore=False)
- fp16_trainer = FP16Trainer(trainer, dynamic_loss_scale=dynamic_loss_scale,
- loss_scaler_params=loss_scale_param)
-
- if args.start_step:
- state_path = os.path.join(args.ckpt_dir, '%07d.states.%02d'%(args.start_step, local_rank))
- logging.info('Loading trainer state from %s', state_path)
- nlp.utils.load_states(trainer, state_path)
-
- accumulate = args.accumulate
- num_train_steps = args.num_steps
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- params = [p for p in param_dict.values() if p.grad_req != 'null']
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- if accumulate > 1:
- for p in params:
- p.grad_req = 'add'
-
- train_begin_time = time.time()
- begin_time = time.time()
- running_mlm_loss, running_nsp_loss = 0, 0
- running_num_tks = 0
- batch_num = 0
- step_num = args.start_step
-
- if args.phase2:
- step_num -= args.phase1_num_steps
-
- logging.info('Training started')
-
- # create dummy data loader if needed
- parallel_model = DataParallelBERT(model, trainer=fp16_trainer)
- num_ctxes = len(ctxs)
- parallel = nlp.utils.Parallel(num_ctxes if num_ctxes > 1 else 0, parallel_model)
-
- while step_num < num_train_steps:
-
- data_train_iter = iter(data_train)
- end_of_batch = False
- next_data_batch = next(data_train_iter)
- while not end_of_batch:
- data_batch = next_data_batch
- if step_num >= num_train_steps:
- break
- if batch_num % accumulate == 0:
- step_num += 1
- # update learning rate
- if step_num <= num_warmup_steps:
- new_lr = lr * step_num / num_warmup_steps
- else:
- offset = (num_train_steps - step_num) / (num_train_steps - num_warmup_steps)
- new_lr = lr * max(offset, 0)
- trainer.set_learning_rate(new_lr)
- if args.profile:
- profile(step_num, 10, 14, profile_name=args.profile + str(rank))
-
- # load data
- data_list = list(split_and_load(data_batch, ctxs))
-
- ns_label_list, ns_pred_list = [], []
- mask_label_list, mask_pred_list, mask_weight_list = [], [], []
-
- num_data = len(data_list)
- for i in range(num_data):
- parallel.put(data_list[i])
- for _ in range(num_data):
- (next_sentence_label, classified, masked_id,
- decoded, masked_weight, ls1, ls2, valid_length) = parallel.get()
- ns_label_list.append(next_sentence_label)
- ns_pred_list.append(classified)
- mask_label_list.append(masked_id)
- mask_pred_list.append(decoded)
- mask_weight_list.append(masked_weight)
- running_mlm_loss += ls1.as_in_context(mx.cpu()) / len(ctxs)
- running_nsp_loss += ls2.as_in_context(mx.cpu()) / len(ctxs)
- running_num_tks += valid_length.sum().as_in_context(mx.cpu())
- # pre fetch next batch
- try:
- next_data_batch = next(data_train_iter)
- except StopIteration:
- end_of_batch = True
-
- # update
- if (batch_num + 1) % accumulate == 0:
- fp16_trainer.step(1, max_norm=1.0 * num_workers)
- if accumulate > 1:
- param_dict.zero_grad()
- # update metrics
- if args.no_compute_acc:
- mask_pred_list[0].wait_to_read()
- else:
- nsp_metric.update(ns_label_list, ns_pred_list)
- mlm_metric.update(mask_label_list, mask_pred_list, mask_weight_list)
-
- # logging
- if step_num % (args.log_interval) == 0 and (batch_num + 1) % accumulate == 0:
- if args.no_compute_acc:
- log_noacc(begin_time, running_num_tks, running_mlm_loss / accumulate,
- running_nsp_loss / accumulate, step_num,
- trainer, args.log_interval)
- else:
- log(begin_time, running_num_tks, running_mlm_loss / accumulate,
- running_nsp_loss / accumulate, step_num, mlm_metric, nsp_metric,
- trainer, args.log_interval)
- mlm_metric.reset_local()
- nsp_metric.reset_local()
- begin_time = time.time()
- running_mlm_loss = running_nsp_loss = running_num_tks = 0
-
- # saving checkpoints
- if step_num % args.ckpt_interval == 0 and (batch_num + 1) % accumulate == 0:
- if is_master_node:
- save_states(step_num, trainer, args.ckpt_dir, local_rank)
- if local_rank == 0:
- save_parameters(step_num, model.bert, args.ckpt_dir)
- if step_num % args.eval_interval == 0 and data_eval \
- and (batch_num + 1) % accumulate == 0:
- # eval data is always based on a fixed npz file.
- dataset_eval = get_pretrain_data_npz(data_eval, batch_size_eval,
- 1, False, 1, vocab)
- evaluate(dataset_eval, model, ctxs, args.log_interval, args.dtype)
-
- batch_num += 1
-
- if is_master_node:
- save_states(step_num, trainer, args.ckpt_dir, local_rank)
- if local_rank == 0:
- save_parameters(step_num, model.bert, args.ckpt_dir)
- mx.nd.waitall()
- train_end_time = time.time()
- logging.info('Train cost={:.1f}s'.format(train_end_time - train_begin_time))
-
-if __name__ == '__main__':
- random_seed = random.randint(0, 1000)
-
- dataset_name, vocab = args.dataset_name, None
- if args.sentencepiece:
- logging.info('loading vocab file from sentence piece model: %s', args.sentencepiece)
- if args.dataset_name:
- warnings.warn('Both --dataset_name and --sentencepiece are provided. '
- 'The vocabulary will be loaded based on --sentencepiece')
- dataset_name = None
- vocab = nlp.vocab.BERTVocab.from_sentencepiece(args.sentencepiece)
-
- model, vocab = get_model_loss(ctxs, args.model, args.pretrained,
- dataset_name, vocab, args.dtype,
- ckpt_dir=args.ckpt_dir,
- start_step=args.start_step)
- logging.info('Model created')
- data_eval = args.data_eval
-
- if args.raw:
- if args.sentencepiece:
- tokenizer = nlp.data.BERTSPTokenizer(args.sentencepiece, vocab,
- lower=not args.cased)
- else:
- tokenizer = nlp.data.BERTTokenizer(vocab=vocab, lower=not args.cased)
-
- cache_dir = os.path.join(args.ckpt_dir, 'data_eval_cache')
- cache_file = os.path.join(cache_dir, 'part-000.npz')
- nlp.utils.mkdir(cache_dir)
-
- # generate dev dataset from the raw text if needed
- if not args.eval_use_npz:
- data_eval = cache_file
- if not os.path.isfile(cache_file) and rank == 0:
- generate_dev_set(tokenizer, vocab, cache_file, args)
-
- logging.debug('Random seed set to %d', random_seed)
- mx.random.seed(random_seed)
-
- if args.data:
- if args.raw:
- get_dataset_fn = functools.partial(get_pretrain_data_text,
- max_seq_length=args.max_seq_length,
- short_seq_prob=args.short_seq_prob,
- masked_lm_prob=args.masked_lm_prob,
- max_predictions_per_seq=args.max_predictions_per_seq,
- whole_word_mask=args.whole_word_mask,
- tokenizer=tokenizer,
- circle_length=args.circle_length,
- repeat=args.repeat,
- dataset_cached=args.dataset_cached,
- num_max_dataset_cached=args.num_max_dataset_cached)
- else:
- get_dataset_fn = get_pretrain_data_npz
-
- if args.synthetic_data:
- data_train = get_dummy_dataloader(batch_size, args.max_seq_length,
- args.max_predictions_per_seq)
- else:
- shuffle = True
- logging.info('args.num_buckets: {}, num_workers: {}, rank: {}'.format(args.num_buckets,
- num_workers,
- rank))
- data_train = get_dataset_fn(args.data, batch_size,
- len(ctxs), shuffle, args.num_buckets, vocab,
- num_parts=num_workers, part_idx=rank,
- num_dataset_workers=args.num_dataset_workers,
- num_batch_workers=args.num_batch_workers)
- train(data_train, data_eval, model)
- if data_eval:
- # eval data is always based on a fixed npz file.
- shuffle = False
- dataset_eval = get_pretrain_data_npz(data_eval, batch_size_eval,
- len(ctxs), shuffle, 1, vocab)
- evaluate(dataset_eval, model, ctxs, args.log_interval, args.dtype)
diff --git a/scripts/bert/sample_text.txt b/scripts/bert/sample_text.txt
deleted file mode 100644
index a42812060c..0000000000
--- a/scripts/bert/sample_text.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-This text is included to make sure Unicode is handled properly: 力加勝北区ᴵᴺᵀᵃছজটডণত
-Text should be one-sentence-per-line, with empty lines between documents.
-This sample text is public domain and was randomly selected from Project Guttenberg.
-
-The rain had only ceased with the gray streaks of morning at Blazing Star, and the settlement awoke to a moral sense of cleanliness, and the finding of forgotten knives, tin cups, and smaller camp utensils, where the heavy showers had washed away the debris and dust heaps before the cabin doors.
-Indeed, it was recorded in Blazing Star that a fortunate early riser had once picked up on the highway a solid chunk of gold quartz which the rain had freed from its incumbering soil, and washed into immediate and glittering popularity.
-Possibly this may have been the reason why early risers in that locality, during the rainy season, adopted a thoughtful habit of body, and seldom lifted their eyes to the rifted or india-ink washed skies above them.
-"Cass" Beard had risen early that morning, but not with a view to discovery.
-A leak in his cabin roof,--quite consistent with his careless, improvident habits,--had roused him at 4 A. M., with a flooded "bunk" and wet blankets.
-The chips from his wood pile refused to kindle a fire to dry his bed-clothes, and he had recourse to a more provident neighbor's to supply the deficiency.
-This was nearly opposite.
-Mr. Cassius crossed the highway, and stopped suddenly.
-Something glittered in the nearest red pool before him.
-Gold, surely!
-But, wonderful to relate, not an irregular, shapeless fragment of crude ore, fresh from Nature's crucible, but a bit of jeweler's handicraft in the form of a plain gold ring.
-Looking at it more attentively, he saw that it bore the inscription, "May to Cass."
-Like most of his fellow gold-seekers, Cass was superstitious.
-
-The fountain of classic wisdom, Hypatia herself.
-As the ancient sage--the name is unimportant to a monk--pumped water nightly that he might study by day, so I, the guardian of cloaks and parasols, at the sacred doors of her lecture-room, imbibe celestial knowledge.
-From my youth I felt in me a soul above the matter-entangled herd.
-She revealed to me the glorious fact, that I am a spark of Divinity itself.
-A fallen star, I am, sir!' continued he, pensively, stroking his lean stomach--'a fallen star!--fallen, if the dignity of philosophy will allow of the simile, among the hogs of the lower world--indeed, even into the hog-bucket itself. Well, after all, I will show you the way to the Archbishop's.
-There is a philosophic pleasure in opening one's treasures to the modest young.
-Perhaps you will assist me by carrying this basket of fruit?' And the little man jumped up, put his basket on Philammon's head, and trotted off up a neighbouring street.
-Philammon followed, half contemptuous, half wondering at what this philosophy might be, which could feed the self-conceit of anything so abject as his ragged little apish guide;
-but the novel roar and whirl of the street, the perpetual stream of busy faces, the line of curricles, palanquins, laden asses, camels, elephants, which met and passed him, and squeezed him up steps and into doorways, as they threaded their way through the great Moon-gate into the ample street beyond, drove everything from his mind but wondering curiosity, and a vague, helpless dread of that great living wilderness, more terrible than any dead wilderness of sand which he had left behind.
-Already he longed for the repose, the silence of the Laura--for faces which knew him and smiled upon him; but it was too late to turn back now.
-His guide held on for more than a mile up the great main street, crossed in the centre of the city, at right angles, by one equally magnificent, at each end of which, miles away, appeared, dim and distant over the heads of the living stream of passengers, the yellow sand-hills of the desert;
-while at the end of the vista in front of them gleamed the blue harbour, through a network of countless masts.
-At last they reached the quay at the opposite end of the street;
-and there burst on Philammon's astonished eyes a vast semicircle of blue sea, ringed with palaces and towers.
-He stopped involuntarily; and his little guide stopped also, and looked askance at the young monk, to watch the effect which that grand panorama should produce on him.
diff --git a/scripts/bert/utils.py b/scripts/bert/utils.py
deleted file mode 100644
index c2c0b5694d..0000000000
--- a/scripts/bert/utils.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions for BERT."""
-
-import sys
-import logging
-import collections
-import hashlib
-import io
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab']
-
-
-def tf_vocab_to_gluon_vocab(tf_vocab):
- special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]']
- assert all(t in tf_vocab for t in special_tokens)
- counter = nlp.data.count_tokens(tf_vocab.keys())
- vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab)
- return vocab
-
-
-def get_hash(filename):
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow # pylint: disable=import-outside-toplevel
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-def profile(curr_step, start_step, end_step, profile_name='profile.json',
- early_exit=True):
- """profile the program between [start_step, end_step)."""
- if curr_step == start_step:
- mx.nd.waitall()
- mx.profiler.set_config(profile_memory=False, profile_symbolic=True,
- profile_imperative=True, filename=profile_name,
- aggregate_stats=True)
- mx.profiler.set_state('run')
- elif curr_step == end_step:
- mx.nd.waitall()
- mx.profiler.set_state('stop')
- logging.info(mx.profiler.dumps())
- mx.profiler.dump()
- if early_exit:
- sys.exit(0)
-
-def load_text_vocab(vocab_file):
- """Loads a vocabulary file into a dictionary."""
- vocab = collections.OrderedDict()
- index = 0
- with io.open(vocab_file, 'r') as reader:
- while True:
- token = reader.readline()
- if not token:
- break
- token = token.strip()
- vocab[token] = index
- index += 1
- return vocab
diff --git a/scripts/conversion_toolkits/README.md b/scripts/conversion_toolkits/README.md
new file mode 100644
index 0000000000..8437202287
--- /dev/null
+++ b/scripts/conversion_toolkits/README.md
@@ -0,0 +1,77 @@
+# Conversion Scripts
+
+In GluonNLP, we provide scripts to convert model checkpoints from other repositories to GluonNLP.
+
+At this stage, the model needs to be downloaded locally, and the conversion scripts accept only a file directory as the
+argument; URLs are not supported. Both TensorFlow fine-tuned models that can be loaded as TF1 Hub modules and TF2
+SavedModels are accepted, although in most cases TF2 SavedModels do not ship the masked language model parameters,
+so those parameters are not compared after conversion.
+
+The comparison step mentioned above is controlled by the flag `--test`, which checks that the Gluon model with the
+converted weights matches the original TensorFlow model within a maximum tolerance of 1e-3.
+In addition, all conversion scripts can run on a GPU by adding `--gpu 0`.
+
+For the RoBERTa, XLM-R and BART models, we rely on the master version of the [fairseq](https://github.com/pytorch/fairseq#requirements-and-installation) package, installed locally via `pip install git+https://github.com/pytorch/fairseq.git@master`.
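+
+To give a concrete feel for the flags described above, a single conversion with verification on GPU 0 might look
+like the sketch below (the directory name is only an example; the per-model scripts in the following sections show
+the full workflow, including the download step):
+
+```bash
+# Assumes the TF Hub archive has already been downloaded and extracted into google_en_uncased_bert_base/
+python3 convert_tf_hub_model.py --tf_hub_model_path google_en_uncased_bert_base --model_type bert --gpu 0 --test
+```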
+
+## Convert all models
+
+```bash
+bash convert_all.sh
+```
+
+### BERT
+Convert models from the [BERT collection](https://tfhub.dev/google/collections/bert/1) on TF Hub.
+
+You can use the script provided in [convert_bert.sh](convert_bert.sh).
+The following command gives you a rough idea of what it does.
+
+```bash
+bash convert_bert.sh
+```
+
+In the process, the configuration file `bert_config.json` (taken from the [official repo](https://github.com/google-research/bert#pre-trained-models))
+is copied into `${case}_bert_${model}/assets/`.
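+
+For reference, the steps performed for a single model roughly look like the sketch below (shown for the uncased base
+model; [convert_bert.sh](convert_bert.sh) runs the same loop over all variants):
+
+```bash
+hub_directory="google_en_uncased_bert_base"
+mkdir ${hub_directory}
+# download and unpack the TF Hub archive
+wget "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+# place the matching config next to the vocabulary assets, then convert and verify
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+```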
+
+### ALBERT
+You can convert the ALBERT models with the following command:
+```bash
+bash convert_albert.sh
+```
+
+### ELECTRA
+The ELECTRA model is currently not available on TF Hub.
+Thus, you will need to clone the [electra repository](https://github.com/ZheyuYe/electra)
+and download the checkpoints; the parameters are converted from these local checkpoints.
+By running the following command, you can convert and verify the ELECTRA model with both the discriminator and the generator.
+
+Notice: please set `--electra_path` to the cloned repository path if you'd like to use `convert_electra.py` directly.
+
+```bash
+bash convert_electra.sh
+```
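+
+If you prefer to call the Python converter directly on an already downloaded and unzipped checkpoint,
+the invocation used inside [convert_electra.sh](convert_electra.sh) is roughly:
+
+```bash
+# electra_small is the unzipped checkpoint directory; electra is the cloned repository
+python3 convert_electra.py --tf_model_path electra_small --electra_path electra --model_size small --test
+```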
+
+### MobileBert
+```bash
+bash convert_mobilebert.sh
+```
+
+### RoBERTa
+```bash
+bash convert_roberta.sh
+```
+
+### XLM-R
+```bash
+bash convert_xlmr.sh
+```
+
+### BART
+```bash
+bash convert_bart.sh
+```
+
+### GPT-2
+```bash
+bash convert_gpt2.sh
+```
diff --git a/scripts/conversion_toolkits/bert_base_config.json b/scripts/conversion_toolkits/bert_base_config.json
new file mode 100644
index 0000000000..fca794a5f0
--- /dev/null
+++ b/scripts/conversion_toolkits/bert_base_config.json
@@ -0,0 +1,13 @@
+{
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 768,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "max_position_embeddings": 512,
+ "num_attention_heads": 12,
+ "num_hidden_layers": 12,
+ "type_vocab_size": 2,
+ "vocab_size": 30522
+}
diff --git a/scripts/conversion_toolkits/bert_large_config.json b/scripts/conversion_toolkits/bert_large_config.json
new file mode 100644
index 0000000000..a7efa973d7
--- /dev/null
+++ b/scripts/conversion_toolkits/bert_large_config.json
@@ -0,0 +1,13 @@
+{
+ "attention_probs_dropout_prob": 0.1,
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.1,
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 4096,
+ "max_position_embeddings": 512,
+ "num_attention_heads": 16,
+ "num_hidden_layers": 24,
+ "type_vocab_size": 2,
+ "vocab_size": 30522
+}
diff --git a/scripts/conversion_toolkits/convert_albert.sh b/scripts/conversion_toolkits/convert_albert.sh
new file mode 100644
index 0000000000..69c37e7bd1
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_albert.sh
@@ -0,0 +1,11 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+python3 -m pip install tensorflow_hub --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+for model in base large xlarge xxlarge
+do
+ hub_directory="google_albert_${model}_v2"
+ mkdir ${hub_directory}
+ wget "https://tfhub.dev/google/albert_${model}/3?tf-hub-format=compressed" -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type albert --test
+done
diff --git a/scripts/conversion_toolkits/convert_all.sh b/scripts/conversion_toolkits/convert_all.sh
new file mode 100644
index 0000000000..a38031e9e1
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_all.sh
@@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+
+bash convert_bert.sh
+bash convert_albert.sh
+bash convert_electra.sh
+bash convert_mobilebert.sh
+bash convert_roberta.sh
+bash convert_xlmr.sh
+bash convert_bart.sh
+bash convert_gpt2.sh
diff --git a/scripts/conversion_toolkits/convert_bart.sh b/scripts/conversion_toolkits/convert_bart.sh
new file mode 100644
index 0000000000..ee6cd1b3ec
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_bart.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir bart_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/bart.${model}.tar.gz"
+ tar zxf bart.${model}.tar.gz --directory bart_${model}
+ python3 convert_fairseq_bart.py --fairseq_model_path bart_${model}/bart.${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_bert.sh b/scripts/conversion_toolkits/convert_bert.sh
new file mode 100644
index 0000000000..1fd3432265
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_bert.sh
@@ -0,0 +1,52 @@
+python3 -m pip install tensorflow==2.3.0 --upgrade --user
+python3 -m pip install tensorflow_hub --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+
+# Conversion for English Models
+for model in base large
+do
+ for case in cased uncased
+ do
+ hub_directory="google_en_${case}_bert_${model}"
+ mkdir ${hub_directory}
+ if [ ${model} == base ];then
+ url="https://tfhub.dev/google/bert_${case}_L-12_H-768_A-12/1?tf-hub-format=compressed"
+ else
+ url="https://tfhub.dev/google/bert_${case}_L-24_H-1024_A-16/1?tf-hub-format=compressed"
+ fi
+ wget ${url} -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ cp bert_${model}_config.json ${hub_directory}/assets/
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+ done
+done
+
+# Conversion for Chinese Models
+url="https://tfhub.dev/tensorflow/bert_zh_L-12_H-768_A-12/2?tf-hub-format=compressed"
+hub_directory="google_zh_bert_base"
+mkdir ${hub_directory}
+wget ${url} -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+
+# Conversion for Multi-lingual Models
+url="https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/2?tf-hub-format=compressed"
+hub_directory="google_multi_cased_bert_base"
+mkdir ${hub_directory}
+wget ${url} -O "${hub_directory}.tar.gz"
+tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+cp bert_base_config.json ${hub_directory}/assets/
+python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+
+# Conversion for Whole-word-masking Models
+for case in cased uncased
+do
+ hub_directory="google_en_${case}_bert_wwm_large"
+ mkdir ${hub_directory}
+ url="https://tfhub.dev/tensorflow/bert_en_wwm_${case}_L-24_H-1024_A-16/2?tf-hub-format=compressed"
+ wget ${url} -O "${hub_directory}.tar.gz"
+ tar -xvf ${hub_directory}.tar.gz --directory ${hub_directory}
+ cp bert_large_config.json ${hub_directory}/assets/
+ python3 convert_tf_hub_model.py --tf_hub_model_path ${hub_directory} --model_type bert --test
+done
diff --git a/scripts/conversion_toolkits/convert_electra.py b/scripts/conversion_toolkits/convert_electra.py
new file mode 100644
index 0000000000..6d60f0e37b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_electra.py
@@ -0,0 +1,439 @@
+import os
+import re
+import sys
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.utils.misc import naming_convention, logging_config
+from gluonnlp.data.tokenizers import HuggingFaceWordPieceTokenizer
+from gluonnlp.models.electra import ElectraModel, \
+ ElectraGenerator, ElectraDiscriminator, ElectraForPretrain, get_generator_cfg
+import tensorflow.compat.v1 as tf
+
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the TF Electra Model to Gluon')
+ parser.add_argument('--tf_model_path', type=str,
+ help='Directory of the model downloaded from TF hub.')
+ parser.add_argument('--electra_path', type=str,
+ help='Path to the github repository of electra, you may clone it by '
+ '`git clone https://github.com/ZheyuYe/electra.git`.')
+ parser.add_argument('--model_size', type=str, choices=['small', 'base', 'large'],
+ help='Size of the Electra model')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='directory path to save the converted Electra model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single gpu to run mxnet, e.g. 0 or 1. The default device is cpu.')
+ parser.add_argument('--test', action='store_true')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def get_dict_config(model_size, electra_path):
+ sys.path.append(electra_path)
+ electra_dir = os.path.abspath(os.path.join(os.path.dirname(electra_path), os.path.pardir))
+ sys.path.append(electra_dir)
+ from electra.util.training_utils import get_bert_config
+ from electra.configure_pretraining import PretrainingConfig
+
+ config = PretrainingConfig(model_name='', data_dir='', model_size=model_size)
+ bert_config = get_bert_config(config)
+    # We do not store the full configuration of the electra generator, only its scale sizes.
+ config_dict = bert_config.to_dict()
+ config_dict.update(
+ {'embedding_size': config.embedding_size,
+ 'generator_hidden_size': config.generator_hidden_size,
+ 'generator_layers': config.generator_layers,
+ })
+ return config_dict
+
+
+def convert_tf_config(config_dict, vocab_size):
+ """Convert the config file"""
+
+ assert vocab_size == config_dict['vocab_size']
+ cfg = ElectraModel.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = config_dict['hidden_size']
+ cfg.MODEL.embed_size = config_dict['embedding_size']
+ cfg.MODEL.hidden_size = config_dict['intermediate_size']
+ cfg.MODEL.max_length = config_dict['max_position_embeddings']
+ cfg.MODEL.num_heads = config_dict['num_attention_heads']
+ cfg.MODEL.num_layers = config_dict['num_hidden_layers']
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = config_dict['hidden_act']
+ cfg.MODEL.layer_norm_eps = 1E-12
+ cfg.MODEL.num_token_types = config_dict['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(config_dict['attention_probs_dropout_prob'])
+ cfg.MODEL.dtype = 'float32'
+ cfg.MODEL.generator_layers_scale = config_dict['generator_layers']
+ cfg.MODEL.generator_units_scale = config_dict['generator_hidden_size']
+ cfg.INITIALIZER.weight = ['truncnorm', 0,
+ config_dict['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir, model_size, electra_path):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ vocab_path = None
+ for ele in file_names:
+ if ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+ assert vocab_path is not None
+
+ if vocab_path:
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+ vocab_size = len(open(vocab_path, 'r', encoding='utf-8').readlines())
+ config_dict = get_dict_config(model_size, electra_path)
+ cfg = convert_tf_config(config_dict, vocab_size)
+ return cfg, vocab_path
+
+
+CONVERT_MAP = [
+ ('backbone_model.discriminator_predictions/dense_1', 'rtd_encoder.2'),
+ ('backbone_model.discriminator_predictions/dense', 'rtd_encoder.0'),
+ ('backbone_model.generator_predictions/dense', 'mlm_decoder.0'),
+ ('backbone_model.generator_predictions/LayerNorm', 'mlm_decoder.2'),
+ ('backbone_model.generator_predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('electra/', ''),
+ ('generator/', ''),
+ ('embeddings_project', 'embed_factorized_proj'),
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('layer_', 'all_encoder_layers.'),
+ ('embeddings/LayerNorm', 'embed_layer_norm'),
+ ('attention/output/LayerNorm', 'layer_norm'),
+ ('attention/output/dense', 'attention_proj'),
+ ('output/LayerNorm', 'ffn.layer_norm'),
+ ('LayerNorm', 'layer_norm'),
+ ('intermediate/dense', 'ffn.ffn_1'),
+ ('output/dense', 'ffn.ffn_2'),
+ ('output/', ''),
+ ('kernel', 'weight'),
+ ('/', '.'),
+]
+
+
+def get_name_map(tf_names, convert_type='backbone'):
+ """
+    Get the mapping between tensorflow tensor names and mxnet parameter names.
+    The mapping CONVERT_MAP above is suitable for Bert- and Albert-style models,
+    but there is no guarantee that it matches other tf models that use
+    special variable_scope names (tensorflow) or prefixes (mxnet).
+
+    Redefining the mapping is encouraged when adapting customized models.
+
+ Parameters
+ ----------
+ tf_names
+ the parameters names of tensorflow model
+ convert_type
+ choices=['backbone', 'disc', 'gen']
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ if convert_type == 'backbone':
+ if 'electra' not in source_name:
+ continue
+ elif convert_type == 'disc':
+ target_name = 'backbone_model.' + target_name
+ if 'generator' in source_name:
+ continue
+ elif convert_type == 'gen':
+ target_name = 'backbone_model.' + target_name
+ if 'generator' not in source_name:
+ continue
+ else:
+ raise NotImplementedError
+ # skip the qkv weights
+ if 'self/' in source_name:
+ name_map[source_name] = None
+ continue
+ for old, new in CONVERT_MAP:
+ target_name = target_name.replace(old, new)
+ name_map[source_name] = target_name
+ return name_map
+
+
+def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, electra_path):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, vocab_path = convert_tf_assets(model_dir, model_size, electra_path)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 3
+ seq_length = 32
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+
+ init_checkpoint = os.path.join(model_dir, 'electra_{}'.format(model_size))
+ tf_params = read_tf_checkpoint(init_checkpoint)
+ # get parameter names for tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = filter(lambda name: name != 'generator_predictions/temperature', tf_names)
+ tf_names = list(tf_names)
+
+ # reload the electra module for this local scope
+ sys.path.append(electra_path)
+ electra_dir = os.path.abspath(os.path.join(os.path.dirname(electra_path), os.path.pardir))
+ sys.path.append(electra_dir)
+ from electra.util.training_utils import get_bert_config
+ from electra.configure_pretraining import PretrainingConfig
+ from electra.model import modeling
+
+ config = PretrainingConfig(model_name='', data_dir='', model_size=model_size)
+ bert_config = get_bert_config(config)
+ bert_model = modeling.BertModel(
+ bert_config=bert_config,
+ is_training=False,
+ input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ token_type_ids=tf_segment_ids,
+ use_one_hot_embeddings=False,
+ embedding_size=cfg.MODEL.embed_size)
+ tvars = tf.trainable_variables()
+ assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+        # the parameter names end with ':0', e.g.
+        # 'electra/embeddings/word_embeddings:0'
+ backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
+ backbone_params = sess.run(backbone_params)
+ tf_token_outputs_np = {
+ 'pooled_output': sess.run(bert_model.get_pooled_output()),
+ 'sequence_output': sess.run(bert_model.get_sequence_output()),
+ }
+
+    # The following part only ensures that the parameters in the backbone model are valid
+ for k in backbone_params:
+ assert_allclose(tf_params[k], backbone_params[k])
+
+ # Build gluon model and initialize
+ gluon_model = ElectraModel.from_cfg(cfg)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+
+ gluon_disc_model = ElectraDiscriminator(cfg)
+ gluon_disc_model.initialize(ctx=ctx)
+ gluon_disc_model.hybridize()
+
+ gen_cfg = get_generator_cfg(cfg)
+ disc_backbone = gluon_disc_model.backbone_model
+ gluon_gen_model = ElectraGenerator(gen_cfg)
+ gluon_gen_model.tie_embeddings(disc_backbone.word_embed.collect_params(),
+ disc_backbone.token_type_embed.collect_params(),
+ disc_backbone.token_pos_embed.collect_params(),
+ disc_backbone.embed_layer_norm.collect_params())
+ gluon_gen_model.initialize(ctx=ctx)
+ gluon_gen_model.hybridize()
+
+    # prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+ for convert_type in ['backbone', 'disc', 'gen']:
+ name_map = get_name_map(tf_names, convert_type=convert_type)
+ # go through the gluon model to infer the shape of parameters
+
+ if convert_type == 'backbone':
+ model = gluon_model
+ contextual_embedding, pooled_output = model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ elif convert_type == 'disc':
+ model = gluon_disc_model
+ contextual_embedding, pooled_output, rtd_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length)
+ elif convert_type == 'gen':
+ model = gluon_gen_model
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+        # Merge query/kernel, key/kernel, value/kernel into the attn_qkv.weight of each layer
+ def convert_qkv_weights(tf_prefix, mx_prefix):
+ """
+ To convert the qkv weights with different prefix.
+
+ In tensorflow framework, the prefix of query/key/value for the albert model is
+ 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
+ and that for the bert model is 'bert/encoder/layer_{}/attention/self/key/bias'.
+ In gluonnlp framework, the prefix is slightly different as
+ 'encoder.all_encoder_groups.0.attn_qkv.weight' for albert model and
+ 'encoder.all_layers.{}.attn_qkv.weight' for bert model, as the
+ curly braces {} can be filled with the layer number.
+ """
+ # Merge query_weight, key_weight, value_weight to mx_params
+ query_weight = tf_params[
+ '{}/query/kernel'.format(tf_prefix)]
+ key_weight = tf_params[
+ '{}/key/kernel'.format(tf_prefix)]
+ value_weight = tf_params[
+ '{}/value/kernel'.format(tf_prefix)]
+ mx_params['{}.attn_qkv.weight'.format(mx_prefix)].set_data(
+ np.concatenate([query_weight, key_weight, value_weight], axis=1).T)
+ # Merge query_bias, key_bias, value_bias to mx_params
+ query_bias = tf_params[
+ '{}/query/bias'.format(tf_prefix)]
+ key_bias = tf_params[
+ '{}/key/bias'.format(tf_prefix)]
+ value_bias = tf_params[
+ '{}/value/bias'.format(tf_prefix)]
+ mx_params['{}.attn_qkv.bias'.format(mx_prefix)].set_data(
+ np.concatenate([query_bias, key_bias, value_bias], axis=0))
+
+ # The below parameters of the generator are already initialized in the
+ # discriminator, no need to reload.
+ disc_embed_params = set(['backbone_model.embed_layer_norm.beta',
+ 'backbone_model.embed_layer_norm.gamma',
+ 'backbone_model.token_pos_embed._embed.weight',
+ 'backbone_model.token_type_embed.weight',
+ 'mlm_decoder.3.weight',
+ 'backbone_model.word_embed.weight'])
+
+ for key in all_keys:
+ if convert_type == 'gen' and key in disc_embed_params:
+ continue
+ assert re.match(r'^(backbone_model\.){0,1}encoder\.all_encoder_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
+ key) is not None, 'Parameter key {} mismatch'.format(key)
+
+ tf_prefix = None
+ for layer_id in range(cfg.MODEL.num_layers):
+ mx_prefix = 'encoder.all_encoder_layers.{}'.format(layer_id)
+ if convert_type == 'gen':
+ mx_prefix = 'backbone_model.' + mx_prefix
+ tf_prefix = 'generator/encoder/layer_{}/attention/self'.format(layer_id)
+ elif convert_type == 'disc':
+ mx_prefix = 'backbone_model.' + mx_prefix
+ tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)
+ else:
+ tf_prefix = 'electra/encoder/layer_{}/attention/self'.format(layer_id)
+
+ convert_qkv_weights(tf_prefix, mx_prefix)
+
+ if convert_type == 'backbone':
+ # test conversion results for backbone model
+ if test_conversion:
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-3, 1E-3)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], 1E-3, 1E-3)
+ model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(model_dir,
+ save_dir, 'model.params'))
+ elif convert_type == 'disc':
+ model.save_parameters(os.path.join(save_dir, 'disc_model.params'), deduplicate=True)
+ logging.info(
+ 'Convert the discriminator model in {} to {}/{}'.format(model_dir, save_dir, 'disc_model.params'))
+ elif convert_type == 'gen':
+ model.save_parameters(os.path.join(save_dir, 'gen_model.params'), deduplicate=True)
+ logging.info('Convert the generator model in {} to {}/{}'.format(model_dir,
+ save_dir, 'gen_model.params'))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir if args.save_dir is not None else os.path.basename(
+ args.tf_model_path) + '_gluon'
+ convert_tf_model(
+ args.tf_model_path,
+ save_dir,
+ args.test,
+ args.model_size,
+ args.gpu,
+ args.electra_path)
diff --git a/scripts/conversion_toolkits/convert_electra.sh b/scripts/conversion_toolkits/convert_electra.sh
new file mode 100644
index 0000000000..93c452329c
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_electra.sh
@@ -0,0 +1,12 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+git clone https://github.com/ZheyuYe/electra.git
+cd electra
+git checkout 923179410471f9e1820b3f0771c239e1752e4e18
+cd ..
+for model in small base large
+do
+ wget https://storage.googleapis.com/electra-data/electra_${model}.zip
+ unzip electra_${model}.zip
+ python3 convert_electra.py --tf_model_path electra_${model} --electra_path electra --model_size ${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_fairseq_bart.py b/scripts/conversion_toolkits/convert_fairseq_bart.py
new file mode 100644
index 0000000000..4c78fff23c
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_bart.py
@@ -0,0 +1,321 @@
+import os
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+import torch
+from fairseq.models.bart import BARTModel as fairseq_BARTModel
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from gluonnlp.models.bart import BartModel
+from convert_fairseq_roberta import convert_vocab
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq BART Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq BART model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted BART model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='The single gpu to run mxnet (e.g. --gpu 0); the default device is cpu.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+
+def convert_config(fairseq_cfg, vocab_size, cfg):
+ print('converting config')
+ cfg.defrost()
+ # Config for the bart base model
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.max_src_length = fairseq_cfg.max_source_positions
+ cfg.MODEL.max_tgt_length = fairseq_cfg.max_target_positions
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.shared_embed = fairseq_cfg.share_all_embeddings
+ cfg.MODEL.scale_embed = not fairseq_cfg.no_scale_embedding
+ cfg.MODEL.tie_weights = fairseq_cfg.share_decoder_input_output_embed
+ cfg.MODEL.data_norm = fairseq_cfg.layernorm_embedding
+ cfg.MODEL.pooler_activation = fairseq_cfg.pooler_activation_fn
+ cfg.MODEL.layer_norm_eps = 1E-5
+ cfg.MODEL.dropout = fairseq_cfg.dropout
+ cfg.MODEL.activation_dropout = fairseq_cfg.activation_dropout
+ cfg.MODEL.attention_dropout = fairseq_cfg.attention_dropout
+ cfg.MODEL.dtype = 'float32'
+
+ # Parameters for the encoder
+ cfg.MODEL.ENCODER.pre_norm = fairseq_cfg.encoder_normalize_before
+ cfg.MODEL.ENCODER.num_layers = fairseq_cfg.encoder_layers
+ cfg.MODEL.ENCODER.units = fairseq_cfg.encoder_embed_dim
+ cfg.MODEL.ENCODER.num_heads = fairseq_cfg.encoder_attention_heads
+ cfg.MODEL.ENCODER.hidden_size = fairseq_cfg.encoder_ffn_embed_dim
+ cfg.MODEL.ENCODER.activation = fairseq_cfg.activation_fn
+
+ # Parameters for the decoder
+ cfg.MODEL.DECODER.pre_norm = fairseq_cfg.decoder_normalize_before
+ cfg.MODEL.DECODER.num_layers = fairseq_cfg.decoder_layers
+ cfg.MODEL.DECODER.units = fairseq_cfg.decoder_embed_dim
+ cfg.MODEL.DECODER.num_heads = fairseq_cfg.decoder_attention_heads
+ cfg.MODEL.DECODER.hidden_size = fairseq_cfg.decoder_ffn_embed_dim
+ cfg.MODEL.DECODER.activation = fairseq_cfg.activation_fn
+
+ cfg.INITIALIZER.embed = ['xavier', 'gaussian', 'in', 1.0]
+ cfg.INITIALIZER.weight = ['xavier', 'uniform', 'avg', 1.0]
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_params(fairseq_model,
+ gluon_cfg,
+ ctx):
+ fairseq_params = fairseq_model.state_dict()
+    # build the gluon BART model whose parameters will be populated from the fairseq checkpoint
+ gluon_model = BartModel.from_cfg(gluon_cfg, use_pooler=False)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_params = gluon_model.collect_params()
+ all_keys = set(gluon_params.keys())
+
+ def convert_attention(num_layers,
+ fairseq_prefix,
+ gluon_prefix,
+ fairseq_attn_prefix='self_attn',
+ gluon_attn_prefix='attn_qkv'):
+ for layer_id in range(num_layers):
+ fs_atten_prefix = \
+ '{}.layers.{}.{}.' \
+ .format(fairseq_prefix, layer_id, fairseq_attn_prefix)
+ fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
+ fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
+ fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
+ fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
+ fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
+ fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
+ gl_qkv_prefix = \
+ '{}.layers.{}.{}.' \
+ .format(gluon_prefix, layer_id, gluon_attn_prefix)
+ gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
+ gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
+ all_keys.remove(gl_qkv_prefix + 'weight')
+ all_keys.remove(gl_qkv_prefix + 'bias')
+ gl_qkv_weight.set_data(
+ np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
+ gl_qkv_bias.set_data(
+ np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))
+
+ def convert_ffn(num_layers, fairseq_prefix, gluon_prefix):
+        # convert the feed-forward layers for the given encoder/decoder prefix
+ for layer_id in range(num_layers):
+ for k, v in [
+ ('fc1.weight', 'ffn.ffn_1.weight'),
+ ('fc1.bias', 'ffn.ffn_1.bias'),
+ ('fc2.weight', 'ffn.ffn_2.weight'),
+ ('fc2.bias', 'ffn.ffn_2.bias'),
+ ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
+ ('final_layer_norm.bias', 'ffn.layer_norm.beta')
+ ]:
+ fs_name = '{}.layers.{}.{}' \
+ .format(fairseq_prefix, layer_id, k)
+ gl_name = '{}.layers.{}.{}' \
+ .format(gluon_prefix, layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting embedding params')
+ padding_idx = fairseq_model.task.dictionary.pad_index
+ for fs_name, gl_name in [
+ ('model.encoder.embed_tokens.weight', 'src_embed_layer.weight'),
+ ('model.encoder.embed_positions.weight', 'src_pos_embed_layer._embed.weight'),
+ ('model.encoder.layernorm_embedding.weight', 'encoder.ln_data.gamma'),
+ ('model.encoder.layernorm_embedding.bias', 'encoder.ln_data.beta'),
+ ('model.decoder.embed_tokens.weight', 'tgt_embed_layer.weight'),
+ ('model.decoder.embed_positions.weight', 'tgt_pos_embed_layer._embed.weight'),
+ ('model.decoder.layernorm_embedding.weight', 'decoder.ln_data.gamma'),
+ ('model.decoder.layernorm_embedding.bias', 'decoder.ln_data.beta'),
+ # final projection in decoder
+ ('model.decoder.output_projection.weight', 'tgt_final_layer.weight'),
+ ]:
+ all_keys.remove(gl_name)
+ if 'embed_positions' in fs_name:
+ # position embed weight
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy()[padding_idx + 1:, :])
+ else:
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting encoder params')
+ encoder_num_layers = gluon_cfg.MODEL.ENCODER.num_layers
+ convert_attention(encoder_num_layers, 'model.encoder', 'encoder')
+ convert_ffn(encoder_num_layers, 'model.encoder', 'encoder')
+ for layer_id in range(encoder_num_layers):
+ for k, v in [
+ ('self_attn.out_proj.weight', 'attention_proj.weight'),
+ ('self_attn.out_proj.bias', 'attention_proj.bias'),
+ ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
+ ('self_attn_layer_norm.bias', 'layer_norm.beta'),
+ ]:
+ fs_name = 'model.encoder.layers.{}.{}' \
+ .format(layer_id, k)
+ gl_name = 'encoder.layers.{}.{}' \
+ .format(layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ print('converting decoder params')
+ decoder_num_layers = gluon_cfg.MODEL.DECODER.num_layers
+ convert_attention(decoder_num_layers, 'model.decoder', 'decoder',
+ gluon_attn_prefix='attn_in_qkv')
+ convert_ffn(decoder_num_layers, 'model.decoder', 'decoder')
+
+ for layer_id in range(decoder_num_layers):
+ for k, v in [
+ ('self_attn.out_proj.weight', 'proj_in.weight'),
+ ('self_attn.out_proj.bias', 'proj_in.bias'),
+ ('self_attn_layer_norm.weight', 'ln_in.gamma'),
+ ('self_attn_layer_norm.bias', 'ln_in.beta'),
+ ('encoder_attn.out_proj.weight', 'proj_inter.weight'),
+ ('encoder_attn.out_proj.bias', 'proj_inter.bias'),
+ ('encoder_attn_layer_norm.weight', 'ln_inter.gamma'),
+ ('encoder_attn_layer_norm.bias', 'ln_inter.beta'),
+ ('encoder_attn.q_proj.weight', 'attn_inter_q.weight'),
+ ('encoder_attn.q_proj.bias', 'attn_inter_q.bias'),
+ ('encoder_attn.k_proj.weight', 'attn_inter_k.weight'),
+ ('encoder_attn.k_proj.bias', 'attn_inter_k.bias'),
+ ('encoder_attn.v_proj.weight', 'attn_inter_v.weight'),
+ ('encoder_attn.v_proj.bias', 'attn_inter_v.bias'),
+
+ ]:
+ fs_name = 'model.decoder.layers.{}.{}' \
+ .format(layer_id, k)
+ gl_name = 'decoder.layers.{}.{}' \
+ .format(layer_id, v)
+ all_keys.remove(gl_name)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+    assert len(all_keys) == 0, 'parameters missing from the fairseq checkpoint'
+
+    # check parameter sharing when share_decoder_input_output_embed is true
+ assert np.array_equal(
+ fairseq_params['model.decoder.embed_tokens.weight'].cpu().numpy(),
+ fairseq_params['model.decoder.output_projection.weight'].cpu().numpy()
+ )
+ return gluon_model
+
+
+def test_model(fairseq_model, gluon_model, gpu):
+ print('testing model')
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ batch_size = 3
+ seq_length = 32
+ vocab_size = len(fairseq_model.task.dictionary)
+ padding_id = fairseq_model.model.decoder.padding_idx
+ input_ids = np.random.randint( # skip padding_id
+ padding_id + 1,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+ valid_length = np.random.randint(
+ seq_length // 2,
+ seq_length,
+ (batch_size,)
+ )
+
+ for i in range(batch_size): # add padding, for fairseq padding mask
+ input_ids[i, valid_length[i]:] = padding_id
+
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ gl_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ gl_dec_out = \
+ gluon_model(gl_input_ids, gl_valid_length, gl_input_ids, gl_valid_length)
+
+ fs_input_ids = torch.from_numpy(input_ids).cuda(gpu)
+ fairseq_model.model.eval()
+ fs_dec_out, fs_extra = \
+ fairseq_model.model.cuda(gpu)(
+ fs_input_ids,
+ valid_length,
+ fs_input_ids,
+ return_all_hiddens=True)
+
+ # checking decoder output
+ gl_dec_out = gl_dec_out.asnumpy()
+ fs_dec_out = fs_dec_out.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_dec_out[j, :valid_length[j], :],
+ fs_dec_out[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+ file_prefix, file_sufix = old_name.split('.')
+ new_name = '{file_prefix}-{short_hash}.{file_sufix}'.format(
+ file_prefix=file_prefix,
+ short_hash=long_hash[:8],
+ file_sufix=file_sufix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ fairseq_bart = fairseq_BARTModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_bart)
+ gluon_cfg = convert_config(fairseq_bart.args, vocab_size,
+ BartModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+ gluon_bart = convert_params(fairseq_bart,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_bart, gluon_bart, args.gpu)
+
+ gluon_bart.save_parameters(os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the BART MLM model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_fairseq_roberta.py b/scripts/conversion_toolkits/convert_fairseq_roberta.py
new file mode 100644
index 0000000000..bcdac44436
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_roberta.py
@@ -0,0 +1,387 @@
+import os
+import re
+import sys
+import json
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+import torch
+from gluonnlp.data.vocab import Vocab as gluon_Vocab
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from fairseq.models.roberta import RobertaModel as fairseq_RobertaModel
+from gluonnlp.models.roberta import RobertaModel, RobertaForMLM
+from gluonnlp.data.tokenizers import HuggingFaceByteBPETokenizer
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq RoBERTa Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq RoBERTa model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted RoBERTa model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='The single gpu to run mxnet (e.g. --gpu 0); the default device is cpu.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+
+def convert_vocab(args, fairseq_model):
+ print('converting vocab')
+ fairseq_dict_path = os.path.join(args.fairseq_model_path, 'dict.txt')
+ merges_save_path = os.path.join(args.save_dir, 'gpt2.merges')
+ vocab_save_path = os.path.join(args.save_dir, 'gpt2.vocab')
+ fairseq_vocab = fairseq_model.task.dictionary
+ # bos_word attr missing in fairseq_vocab
+ fairseq_vocab.bos_word = fairseq_vocab[fairseq_vocab.bos_index]
+
+ assert os.path.exists(fairseq_dict_path), \
+ '{} not found'.format(fairseq_dict_path)
+ from mxnet.gluon.utils import download
+ temp_vocab_file = download(
+ 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json')
+ temp_merges_file = download(
+ 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe')
+    # copy the merges file directly
+ shutil.copy(temp_merges_file, merges_save_path)
+
+ # build vocab
+ transfer_dict = []
+ with open(fairseq_dict_path, 'r', encoding='utf-8') as f_dict:
+ for line in f_dict:
+ word_id, count = line.split(' ', 1)
+ transfer_dict.append(word_id)
+ transfer_dict = {transfer_dict[i]: i for i in range(len(transfer_dict))}
+ with open(temp_vocab_file, 'r', encoding='utf-8') as f_v:
+ inter_vocab = json.load(f_v)
+ # transfer by dict
+ for k in inter_vocab:
+ inter_vocab[k] = transfer_dict[str(inter_vocab[k])]
+ inter_vocab = list(inter_vocab.items())
+ inter_vocab = sorted(inter_vocab, key=lambda x: x[1])
+ tokens = [e[0] for e in inter_vocab]
+
+ tail = [
+ vocab for vocab in fairseq_vocab.indices.keys() if re.match(
+ r'^madeupword[\d]{4}$',
+ vocab) is not None]
+    all_tokens = ['<s>', '<pad>', '</s>', '<unk>'] + \
+        tokens + tail + ['<mask>']
+
+ gluon_vocab = gluon_Vocab(all_tokens,
+ unk_token=fairseq_vocab.unk_word,
+ pad_token=fairseq_vocab.pad_word,
+ eos_token=fairseq_vocab.eos_word,
+ bos_token=fairseq_vocab.bos_word,
+ mask_token=fairseq_vocab[-1])
+ gluon_vocab.save(vocab_save_path)
+ os.remove(temp_vocab_file)
+ os.remove(temp_merges_file)
+
+ gluon_tokenizer = HuggingFaceByteBPETokenizer(
+ merges_save_path,
+ vocab_save_path
+ )
+
+ if args.test:
+ test_vocab(fairseq_model, gluon_tokenizer)
+
+ vocab_size = len(fairseq_vocab)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+
+def test_vocab(fairseq_model, gluon_tokenizer, check_all_tokens=False):
+ print('testing vocab')
+ fairseq_vocab = fairseq_model.task.dictionary
+ gluon_vocab = gluon_tokenizer.vocab
+ assert len(fairseq_vocab) == \
+ len(gluon_vocab)
+
+ # assert all_tokens
+    # roberta with the gpt2 byte-level bpe does not provide all tokens directly
+ if check_all_tokens:
+ for i in range(len(fairseq_vocab)):
+ assert fairseq_vocab[i] == gluon_vocab.all_tokens[i], \
+ '{}, {}, {}'.format(i, fairseq_vocab[i], gluon_vocab.all_tokens[i])
+
+ # assert special tokens
+ for special_tokens in ['unk', 'pad', 'eos', 'bos']:
+ assert getattr(fairseq_vocab, special_tokens + '_index') == \
+ getattr(gluon_vocab, special_tokens + '_id')
+ assert getattr(fairseq_vocab, special_tokens + '_word') == \
+ getattr(gluon_vocab, special_tokens + '_token')
+    # <mask> is the last token
+    assert fairseq_vocab[-1] == \
+        gluon_vocab.all_tokens[-1] == \
+        '<mask>'
+
+ sentence = "Hello, y'all! How are you Ⅷ 😁 😁 😁 ?" + \
+ 'GluonNLP is great!!!!!!' + \
+ "GluonNLP-Amazon-Haibin-Leonard-Sheng-Shuai-Xingjian...../:!@# 'abc'"
+ # assert encode
+ fs_tokens = fairseq_model.encode(sentence)
+ gl_tokens = gluon_tokenizer.encode(sentence, int)
+ # Notice: we may append bos and eos
+    # manually after tokenizing sentences
+ assert fs_tokens.numpy().tolist()[1:-1] == gl_tokens
+
+ # assert decode
+ fs_sentence = fairseq_model.decode(fs_tokens)
+ gl_sentence = gluon_tokenizer.decode(gl_tokens)
+ assert fs_sentence == gl_sentence
+
+
+def convert_config(fairseq_cfg, vocab_size, cfg):
+ print('converting config')
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = fairseq_cfg.encoder_embed_dim
+ cfg.MODEL.hidden_size = fairseq_cfg.encoder_ffn_embed_dim
+ cfg.MODEL.max_length = fairseq_cfg.max_positions
+ cfg.MODEL.num_heads = fairseq_cfg.encoder_attention_heads
+ cfg.MODEL.num_layers = fairseq_cfg.encoder_layers
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = fairseq_cfg.activation_fn
+ cfg.MODEL.pooler_activation = fairseq_cfg.pooler_activation_fn
+ cfg.MODEL.layer_norm_eps = 1E-5
+ cfg.MODEL.hidden_dropout_prob = fairseq_cfg.dropout
+ cfg.MODEL.attention_dropout_prob = fairseq_cfg.attention_dropout
+ cfg.MODEL.dtype = 'float32'
+ cfg.INITIALIZER.embed = ['truncnorm', 0, 0.02]
+ cfg.INITIALIZER.weight = ['truncnorm', 0, 0.02]
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_params(fairseq_model,
+ gluon_cfg,
+ ctx):
+ fairseq_params = fairseq_model.state_dict()
+ fairseq_prefix = 'model.encoder.'
+ gluon_prefix = 'backbone_model.'
+ print('converting {} params'.format(gluon_prefix))
+
+ gluon_model = RobertaForMLM(backbone_cfg=gluon_cfg)
+ # output all hidden states for testing
+ gluon_model.backbone_model._output_all_encodings = True
+ gluon_model.backbone_model.encoder._output_all_encodings = True
+
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_params = gluon_model.collect_params()
+ num_layers = gluon_cfg.MODEL.num_layers
+ for layer_id in range(num_layers):
+ fs_atten_prefix = \
+ '{}sentence_encoder.layers.{}.self_attn.' \
+ .format(fairseq_prefix, layer_id)
+ fs_q_weight = fairseq_params[fs_atten_prefix + 'q_proj.weight'].cpu().numpy()
+ fs_k_weight = fairseq_params[fs_atten_prefix + 'k_proj.weight'].cpu().numpy()
+ fs_v_weight = fairseq_params[fs_atten_prefix + 'v_proj.weight'].cpu().numpy()
+ fs_q_bias = fairseq_params[fs_atten_prefix + 'q_proj.bias'].cpu().numpy()
+ fs_k_bias = fairseq_params[fs_atten_prefix + 'k_proj.bias'].cpu().numpy()
+ fs_v_bias = fairseq_params[fs_atten_prefix + 'v_proj.bias'].cpu().numpy()
+ gl_qkv_prefix = \
+ '{}encoder.all_layers.{}.attn_qkv.' \
+ .format(gluon_prefix, layer_id)
+ gl_qkv_weight = gluon_params[gl_qkv_prefix + 'weight']
+ gl_qkv_bias = gluon_params[gl_qkv_prefix + 'bias']
+ gl_qkv_weight.set_data(
+ np.concatenate([fs_q_weight, fs_k_weight, fs_v_weight], axis=0))
+ gl_qkv_bias.set_data(
+ np.concatenate([fs_q_bias, fs_k_bias, fs_v_bias], axis=0))
+
+ for k, v in [
+ ('self_attn.out_proj.weight', 'attention_proj.weight'),
+ ('self_attn.out_proj.bias', 'attention_proj.bias'),
+ ('self_attn_layer_norm.weight', 'layer_norm.gamma'),
+ ('self_attn_layer_norm.bias', 'layer_norm.beta'),
+ ('fc1.weight', 'ffn.ffn_1.weight'),
+ ('fc1.bias', 'ffn.ffn_1.bias'),
+ ('fc2.weight', 'ffn.ffn_2.weight'),
+ ('fc2.bias', 'ffn.ffn_2.bias'),
+ ('final_layer_norm.weight', 'ffn.layer_norm.gamma'),
+ ('final_layer_norm.bias', 'ffn.layer_norm.beta')
+ ]:
+ fs_name = '{}sentence_encoder.layers.{}.{}' \
+ .format(fairseq_prefix, layer_id, k)
+ gl_name = '{}encoder.all_layers.{}.{}' \
+ .format(gluon_prefix, layer_id, v)
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ for k, v in [
+ ('sentence_encoder.embed_tokens.weight', 'word_embed.weight'),
+ ('sentence_encoder.emb_layer_norm.weight', 'embed_ln.gamma'),
+ ('sentence_encoder.emb_layer_norm.bias', 'embed_ln.beta'),
+ ]:
+ fs_name = fairseq_prefix + k
+ gl_name = gluon_prefix + v
+ gluon_params[gl_name].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+
+ # position embed weight
+ padding_idx = fairseq_model.task.dictionary.pad_index
+ fs_pos_embed_name = fairseq_prefix + 'sentence_encoder.embed_positions.weight'
+ gl_pos_embed_name = gluon_prefix + 'pos_embed._embed.weight'
+ gluon_params[gl_pos_embed_name].set_data(
+ fairseq_params[fs_pos_embed_name].cpu().numpy()[padding_idx + 1:, :])
+
+ for k, v in [
+ ('lm_head.dense.weight', 'mlm_decoder.0.weight'),
+ ('lm_head.dense.bias', 'mlm_decoder.0.bias'),
+ ('lm_head.layer_norm.weight', 'mlm_decoder.2.gamma'),
+ ('lm_head.layer_norm.bias', 'mlm_decoder.2.beta'),
+ ('lm_head.bias', 'mlm_decoder.3.bias')
+ ]:
+ fs_name = fairseq_prefix + k
+ gluon_params[v].set_data(
+ fairseq_params[fs_name].cpu().numpy())
+ # assert untie=False
+ assert np.array_equal(
+ fairseq_params[fairseq_prefix + 'sentence_encoder.embed_tokens.weight'].cpu().numpy(),
+ fairseq_params[fairseq_prefix + 'lm_head.weight'].cpu().numpy()
+ )
+ return gluon_model
+
+
+def test_model(fairseq_model, gluon_model, gpu):
+ print('testing model')
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ batch_size = 3
+ seq_length = 32
+ vocab_size = len(fairseq_model.task.dictionary)
+ padding_id = fairseq_model.model.encoder.sentence_encoder.padding_idx
+ input_ids = np.random.randint( # skip padding_id
+ padding_id + 1,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+ valid_length = np.random.randint(
+ seq_length // 2,
+ seq_length,
+ (batch_size,)
+ )
+
+ for i in range(batch_size): # add padding, for fairseq padding mask
+ input_ids[i, valid_length[i]:] = padding_id
+
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ gl_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+    # use every position as a masked position so that all tokens are projected by the MLM head
+ gl_masked_positions = mx.npx.arange_like(gl_input_ids, axis=1)
+ gl_masked_positions = gl_masked_positions + mx.np.zeros_like(gl_input_ids)
+
+ fs_input_ids = torch.from_numpy(input_ids).cuda(gpu)
+
+ fairseq_model.model.eval()
+
+ gl_all_hiddens, gl_pooled, gl_mlm_scores = \
+ gluon_model(gl_input_ids, gl_valid_length, gl_masked_positions)
+
+ fs_mlm_scores, fs_extra = \
+ fairseq_model.model.cuda(gpu)(
+ fs_input_ids,
+ return_all_hiddens=True)
+ fs_all_hiddens = fs_extra['inner_states']
+
+ # checking all_encodings_outputs
+ num_layers = fairseq_model.args.encoder_layers
+ for i in range(num_layers + 1):
+ gl_hidden = gl_all_hiddens[i].asnumpy()
+ fs_hidden = fs_all_hiddens[i]
+ fs_hidden = fs_hidden.transpose(0, 1)
+ fs_hidden = fs_hidden.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_hidden[j, :valid_length[j], :],
+ fs_hidden[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+ # checking masked_language_scores
+ gl_mlm_scores = gl_mlm_scores.asnumpy()
+ fs_mlm_scores = fs_mlm_scores.detach().cpu().numpy()
+ for j in range(batch_size):
+ assert_allclose(
+ gl_mlm_scores[j, :valid_length[j], :],
+ fs_mlm_scores[j, :valid_length[j], :],
+ 1E-3,
+ 1E-3
+ )
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+ file_prefix, file_sufix = old_name.split('.')
+ new_name = '{file_prefix}-{short_hash}.{file_sufix}'.format(
+ file_prefix=file_prefix,
+ short_hash=long_hash[:8],
+ file_sufix=file_sufix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ fairseq_roberta = fairseq_RobertaModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_roberta)
+
+ gluon_cfg = convert_config(fairseq_roberta.args, vocab_size,
+ RobertaModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+ gluon_roberta = convert_params(fairseq_roberta,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_roberta, gluon_roberta, args.gpu)
+
+ gluon_roberta.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the RoBERTa MLM model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model_mlm.params')))
+ gluon_roberta.backbone_model.save_parameters(
+ os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the RoBERTa backbone model in {} to {}'.
+ format(os.path.join(args.fairseq_model_path, 'model.pt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
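+    # rename each produced file via naming_convention (which embeds a short content hash)
+    # and log its sha1 and size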
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_fairseq_xlmr.py b/scripts/conversion_toolkits/convert_fairseq_xlmr.py
new file mode 100644
index 0000000000..4b3ec74da6
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_fairseq_xlmr.py
@@ -0,0 +1,120 @@
+import os
+import copy
+import logging
+import argparse
+
+import mxnet as mx
+
+from gluonnlp.utils.misc import logging_config
+from gluonnlp.models.xlmr import XLMRModel, XLMRForMLM
+from gluonnlp.third_party import sentencepiece_model_pb2
+from fairseq.models.roberta import XLMRModel as fairseq_XLMRModel
+from convert_fairseq_roberta import rename, test_model, test_vocab, convert_config, convert_params
+from gluonnlp.data.tokenizers import SentencepieceTokenizer
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the fairseq XLM-R Model to Gluon.')
+ parser.add_argument('--fairseq_model_path', type=str, required=True,
+ help='Directory of the fairseq XLM-R model.')
+ parser.add_argument('--model_size', type=str, choices=['base', 'large'], default='base',
+ help='Size of XLM-R model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted XLM-R model.')
+ parser.add_argument('--gpu', type=int, default=None,
+                        help='The GPU to run MXNet on, e.g. --gpu 0. '
+                             'The default device is CPU.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+def convert_vocab(args, fairseq_model):
+ print('converting vocab')
+ origin_spm_path = os.path.join(args.fairseq_model_path, 'sentencepiece.bpe.model')
+ assert os.path.exists(origin_spm_path)
+ new_spm_path = os.path.join(args.save_dir, 'sentencepiece.model')
+ fairseq_vocab = fairseq_model.task.dictionary
+ # bos_word attr missing in fairseq_vocab
+ fairseq_vocab.bos_word = fairseq_vocab[fairseq_vocab.bos_index]
+
+    # reorder the special pieces so the sentencepiece model matches the fairseq dictionary:
+    # model.pieces: <unk>, <s>, </s>, other_tokens ->
+    # model.pieces: <s>, <pad>, </s>, <unk>, other_tokens, <mask>
+ model = sentencepiece_model_pb2.ModelProto()
+ with open(origin_spm_path, 'rb') as f_m:
+ model.ParseFromString(f_m.read())
+ p0 = model.pieces[0]
+ p1 = model.pieces[1]
+ p2 = model.pieces[2]
+
+ pad_piece = copy.deepcopy(p0)
+ pad_piece.piece = fairseq_vocab.pad_word
+ pad_piece.type = pad_piece.CONTROL
+ mask_piece = copy.deepcopy(p0)
+    mask_piece.piece = '<mask>'
+ mask_piece.type = mask_piece.CONTROL
+
+ p0.type = p0.CONTROL
+ p0.piece = fairseq_vocab.bos_word
+ p1.type = p1.CONTROL
+ p1.piece = fairseq_vocab.eos_word
+ p2.type = p2.UNKNOWN
+ p2.piece = fairseq_vocab.unk_word
+ model.pieces.insert(fairseq_vocab.pad_index, pad_piece)
+ model.pieces.append(mask_piece)
+
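+    # record the fairseq special token ids in the trainer_spec so the converted
+    # sentencepiece model stays consistent with the fairseq dictionary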
+ model.trainer_spec.vocab_size = len(fairseq_vocab)
+ model.trainer_spec.unk_id = fairseq_vocab.unk_index
+ model.trainer_spec.bos_id = fairseq_vocab.bos_index
+ model.trainer_spec.eos_id = fairseq_vocab.eos_index
+ model.trainer_spec.pad_id = fairseq_vocab.pad_index
+
+ with open(new_spm_path, 'wb') as f:
+ f.write(model.SerializeToString())
+
+ gluon_tokenizer = SentencepieceTokenizer(new_spm_path)
+ if args.test:
+ test_vocab(fairseq_model, gluon_tokenizer, check_all_tokens=True)
+
+ vocab_size = len(fairseq_model.task.dictionary)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+def convert_fairseq_model(args):
+ if not args.save_dir:
+ args.save_dir = os.path.basename(args.fairseq_model_path) + '_gluon'
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+ fairseq_xlmr = fairseq_XLMRModel.from_pretrained(args.fairseq_model_path,
+ checkpoint_file='model.pt')
+ vocab_size = convert_vocab(args, fairseq_xlmr)
+
+ gluon_cfg = convert_config(fairseq_xlmr.args, vocab_size,
+ XLMRModel.get_cfg().clone())
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_cfg.dump())
+
+ ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
+
+ gluon_xlmr = convert_params(fairseq_xlmr,
+ gluon_cfg,
+ ctx)
+ if args.test:
+ test_model(fairseq_xlmr, gluon_xlmr, args.gpu)
+
+ gluon_xlmr.save_parameters(os.path.join(args.save_dir, 'model_mlm.params'), deduplicate=True)
+    logging.info('Convert the XLM-R MLM model in {} to {}'.
+                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
+                        os.path.join(args.save_dir, 'model_mlm.params')))
+ gluon_xlmr.backbone_model.save_parameters(
+ os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+    logging.info('Convert the XLM-R backbone model in {} to {}'.
+                 format(os.path.join(args.fairseq_model_path, 'model.pt'),
+                        os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ rename(args.save_dir)
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_fairseq_model(args)
diff --git a/scripts/conversion_toolkits/convert_gpt2.py b/scripts/conversion_toolkits/convert_gpt2.py
new file mode 100644
index 0000000000..7efe720922
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_gpt2.py
@@ -0,0 +1,257 @@
+import os
+import re
+import json
+import shutil
+import logging
+import argparse
+
+import tensorflow as tf
+from tensorflow.contrib.training import HParams
+from gpt_2.src import model
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.data.vocab import Vocab
+from gluonnlp.utils.misc import sha1sum, logging_config, naming_convention
+from gluonnlp.models.gpt2 import GPT2Model, GPT2ForLM
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the tf GPT-2 Model to Gluon.')
+ parser.add_argument('--tf_model_path', type=str, required=True,
+ help='Directory of the tf GPT-2 model.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='Directory path to save the converted GPT-2 model.')
+ parser.add_argument('--test', action='store_true',
+ help='Whether to test the conversion.')
+ return parser.parse_args()
+
+def convert_vocab(args):
+ print('converting vocab')
+ merges_path = os.path.join(args.tf_model_path, 'vocab.bpe')
+ vocab_path = os.path.join(args.tf_model_path, 'encoder.json')
+ gluon_merges_path = os.path.join(args.save_dir, 'gpt2.merges')
+ gluon_vocab_path = os.path.join(args.save_dir, 'gpt2.vocab')
+
+ shutil.copy(merges_path, gluon_merges_path)
+ with open(vocab_path, 'r', encoding='utf-8') as f_v:
+ tf_vocab = json.load(f_v)
+ tf_vocab = list(tf_vocab.items())
+ tf_vocab = sorted(tf_vocab, key=lambda x: x[1])
+ all_tokens = [e[0] for e in tf_vocab]
+ eos_token = all_tokens[-1]
+ assert eos_token == '<|endoftext|>'
+ gluon_vocab = Vocab(all_tokens,
+ unk_token=None,
+ eos_token=eos_token)
+ gluon_vocab.save(gluon_vocab_path)
+
+ vocab_size = len(gluon_vocab)
+ print('| converted dictionary: {} types'.format(vocab_size))
+ return vocab_size
+
+
+def convert_config(tf_cfg, vocab_size):
+ print('converting config')
+ cfg = GPT2Model.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = tf_cfg['n_vocab']
+ cfg.MODEL.units = tf_cfg['n_embd']
+ cfg.MODEL.max_length = tf_cfg['n_ctx']
+ cfg.MODEL.num_heads = tf_cfg['n_head']
+ cfg.MODEL.num_layers = tf_cfg['n_layer']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def read_tf_ckpt(path):
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_backbone_params(tf_params, gluon_backbone_model):
+ TF_GLUON_NAME_MAP = {
+ 'model/wte' : '_embed.weight',
+ 'model/wpe' : '_pos_embed._embed.weight',
+        r'model/h(\d+)/ln_1/b' : '_layers.{}.atten.ln.beta',
+        r'model/h(\d+)/ln_1/g' : '_layers.{}.atten.ln.gamma',
+        r'model/h(\d+)/ln_2/b' : '_layers.{}.ffn.layer_norm.beta',
+        r'model/h(\d+)/ln_2/g' : '_layers.{}.ffn.layer_norm.gamma',
+        r'model/h(\d+)/mlp/c_fc/w' : '_layers.{}.ffn.ffn_1.weight',
+        r'model/h(\d+)/mlp/c_fc/b' : '_layers.{}.ffn.ffn_1.bias',
+        r'model/h(\d+)/mlp/c_proj/w' : '_layers.{}.ffn.ffn_2.weight',
+        r'model/h(\d+)/mlp/c_proj/b' : '_layers.{}.ffn.ffn_2.bias',
+        r'model/h(\d+)/attn/c_attn/w' : '_layers.{}.atten.qkv.weight',
+        r'model/h(\d+)/attn/c_attn/b' : '_layers.{}.atten.qkv.bias',
+        r'model/h(\d+)/attn/c_proj/w' : '_layers.{}.atten.out_proj.weight',
+        r'model/h(\d+)/attn/c_proj/b' : '_layers.{}.atten.out_proj.bias',
+        'model/ln_f/b' : '_final_ln.beta',
+        'model/ln_f/g' : '_final_ln.gamma'
+ }
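+    # each TF name is matched against the patterns above in order and the first match wins;
+    # e.g. 'model/h3/attn/c_attn/w' maps to '_layers.3.atten.qkv.weight'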
+
+ params = gluon_backbone_model.collect_params()
+ loaded = {k: False for k in params}
+ for name, param_value in tf_params.items():
+ gluon_param_name = None
+ for lhs, rhs in TF_GLUON_NAME_MAP.items():
+ match = re.match(lhs, name)
+ if match is not None:
+                if len(match.groups()) > 0:
+                    gluon_param_name = rhs.format(match.groups()[0])
+                else:
+                    gluon_param_name = rhs
+                break
+ assert gluon_param_name is not None
+ print('{} --> {}'.format(name, gluon_param_name))
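+        # the TF GPT-2 checkpoint stores conv1d kernels with a leading singleton axis
+        # (1, in_units, out_units); squeeze it and transpose to Gluon's (out_units, in_units)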
+ if param_value.shape != params[gluon_param_name].shape:
+ params[gluon_param_name].set_data(param_value[0].T)
+ else:
+ params[gluon_param_name].set_data(param_value)
+ loaded[gluon_param_name] = True
+ for name in params:
+ if not loaded[name]:
+ print('{} is not loaded!'.format(name))
+
+
+def rename(save_dir):
+ """Rename converted files with hash"""
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ old_path = os.path.join(save_dir, old_name)
+ long_hash = sha1sum(old_path)
+        file_prefix, file_suffix = old_name.split('.')
+        new_name = '{file_prefix}-{short_hash}.{file_suffix}'.format(
+            file_prefix=file_prefix,
+            short_hash=long_hash[:8],
+            file_suffix=file_suffix)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{} {} {}'.format(new_path, long_hash, file_size))
+
+
+def test_model(tf_model_path, gluon_model):
+ # test data
+ ctx = mx.cpu()
+
+ seed = 123
+ batch_size = 3
+ seq_length = 32
+ vocab_size = gluon_model._backbone_model._vocab_size
+ np.random.seed(seed)
+ input_ids = np.random.randint(
+ 0,
+ vocab_size,
+ (batch_size, seq_length)
+ )
+
+ with open(os.path.join(tf_model_path, 'hparams.json'), 'r') as hf:
+ tf_cfg = json.load(hf)
+ hparams = HParams(
+ n_vocab=tf_cfg['n_vocab'],
+ n_ctx=tf_cfg['n_ctx'],
+ n_embd=tf_cfg['n_embd'],
+ n_head=tf_cfg['n_head'],
+ n_layer=tf_cfg['n_layer'],
+ )
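+    # the TF 'past' tensor starts with an empty time dimension, and init_states builds the
+    # corresponding initial states for the Gluon model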
+    tf_start_states = np.zeros((batch_size, hparams.n_layer, 2, hparams.n_head,
+                                0, hparams.n_embd // hparams.n_head))
+ gl_start_states = gluon_model.init_states(batch_size, ctx)
+
+ # gluon model
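+    # run two forward passes: the first starts from the empty states, the second reuses the
+    # cached states so the incremental decoding path is also compared against TF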
+ gl_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+    gl_logits_1, gl_states = gluon_model(gl_input_ids, gl_start_states,
+                                         mx.np.array(0, dtype=np.int32, ctx=ctx))
+    gl_logits_2, _ = gluon_model(gl_input_ids, gl_states,
+                                 mx.np.array(seq_length, dtype=np.int32, ctx=ctx))
+
+ # tf model
+ with tf.Session(graph=tf.Graph()) as sess:
+ tf.set_random_seed(None)
+ tf_context = tf.placeholder(tf.int32, [batch_size, seq_length])
+ tf_past = tf.placeholder(tf.float32, [batch_size, hparams.n_layer, 2, hparams.n_head,
+ None, hparams.n_embd // hparams.n_head])
+ tf_lm_output = model.model(hparams=hparams, X=tf_context, past=tf_past, reuse=tf.AUTO_REUSE)
+
+ tf_saver = tf.train.Saver()
+ tf_ckpt = tf.train.latest_checkpoint(tf_model_path)
+ tf_saver.restore(sess, tf_ckpt)
+
+ tf_output_1 = sess.run(tf_lm_output, feed_dict={tf_context:input_ids, tf_past:tf_start_states})
+ tf_logits_1 = tf_output_1['logits']
+ tf_present = tf_output_1['present']
+
+ tf_output_2 = sess.run(tf_lm_output, feed_dict={tf_context:input_ids, tf_past:tf_present})
+ tf_logits_2 = tf_output_2['logits']
+
+ for j in range(batch_size):
+ assert_allclose(
+ gl_logits_1[j, :, :].asnumpy(),
+ tf_logits_1[j, :, :],
+ 1E-3,
+ 1E-3
+ )
+ for j in range(batch_size):
+ assert_allclose(
+ gl_logits_2[j, :, :].asnumpy(),
+ tf_logits_2[j, :, :],
+ 1E-3,
+ 1E-3
+ )
+
+def convert_gpt2(args):
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+
+ tf_ckpt_path = os.path.join(args.tf_model_path, 'model.ckpt')
+ tf_params = read_tf_ckpt(tf_ckpt_path)
+ with open(os.path.join(args.tf_model_path, 'hparams.json'), 'r') as hf:
+ tf_cfg = json.load(hf)
+
+ vocab_size = convert_vocab(args)
+ gluon_backbone_cfg = convert_config(tf_cfg, vocab_size)
+ with open(os.path.join(args.save_dir, 'model.yml'), 'w') as of:
+ of.write(gluon_backbone_cfg.dump())
+
+ gluon_gpt2forlm_model = GPT2ForLM(gluon_backbone_cfg)
+ gluon_gpt2forlm_model.initialize(ctx=mx.cpu())
+ gluon_gpt2forlm_model.hybridize()
+ gluon_backbone_model = gluon_gpt2forlm_model._backbone_model
+ convert_backbone_params(tf_params, gluon_backbone_model)
+
+ if args.test:
+ test_model(args.tf_model_path, gluon_gpt2forlm_model)
+
+    gluon_gpt2forlm_model.save_parameters(os.path.join(args.save_dir, 'model_lm.params'),
+                                          deduplicate=True)
+ logging.info('Convert the GPT2 LM model in {} to {}'.
+ format(os.path.join(args.tf_model_path, 'model.ckpt'),
+ os.path.join(args.save_dir, 'model_lm.params')))
+ gluon_backbone_model.save_parameters(os.path.join(args.save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the GPT2 backbone model in {} to {}'.
+ format(os.path.join(args.tf_model_path, 'model.ckpt'),
+ os.path.join(args.save_dir, 'model.params')))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+ old_names = os.listdir(args.save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(args.save_dir, old_name)
+ old_path = os.path.join(args.save_dir, old_name)
+ new_path = os.path.join(args.save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(args.save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ convert_gpt2(args)
diff --git a/scripts/conversion_toolkits/convert_gpt2.sh b/scripts/conversion_toolkits/convert_gpt2.sh
new file mode 100644
index 0000000000..a551250c4b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_gpt2.sh
@@ -0,0 +1,8 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+git clone https://github.com/openai/gpt-2.git gpt_2
+for model in 124M 355M 774M
+do
+ python3 gpt_2/download_model.py ${model}
+ mkdir gpt2_${model}
+ CUDA_VISIBLE_DEVICES="" python3 convert_gpt2.py --tf_model_path models/${model} --save_dir gpt2_${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_mobilebert.py b/scripts/conversion_toolkits/convert_mobilebert.py
new file mode 100644
index 0000000000..756b86ca31
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_mobilebert.py
@@ -0,0 +1,343 @@
+import os
+import re
+import json
+import sys
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.utils.misc import sha1sum, naming_convention, logging_config
+from gluonnlp.data.tokenizers import HuggingFaceWordPieceTokenizer
+from gluonnlp.models.mobilebert import MobileBertModel, MobileBertForPretrain
+import tensorflow.compat.v1 as tf
+
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Convert the TF MobileBERT model to Gluon')
+    parser.add_argument('--tf_model_path', type=str,
+                        help='Directory of the model downloaded from TF hub.')
+    parser.add_argument('--mobilebert_dir', type=str,
+                        help='Path to the github repository of mobilebert, you may clone it by '
+                             '`svn checkout https://github.com/google-research/'
+                             'google-research/trunk/mobilebert`.')
+    parser.add_argument('--save_dir', type=str, default=None,
+                        help='Directory path to save the converted MobileBERT model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single GPU to run MXNet, e.g. 0 or 1. The default device is CPU.')
+    parser.add_argument('--test', action='store_true',
+                        help='Whether to test the conversion.')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_tf_config(config_dict_path, vocab_size):
+ """Convert the config file"""
+ with open(config_dict_path, encoding='utf-8') as f:
+ config_dict = json.load(f)
+ assert vocab_size == config_dict['vocab_size']
+ cfg = MobileBertModel.get_cfg().clone()
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+ cfg.MODEL.units = config_dict['hidden_size']
+ cfg.MODEL.embed_size = config_dict['embedding_size']
+ cfg.MODEL.inner_size = config_dict['intra_bottleneck_size']
+ cfg.MODEL.hidden_size = config_dict['intermediate_size']
+ cfg.MODEL.max_length = config_dict['max_position_embeddings']
+ cfg.MODEL.num_heads = config_dict['num_attention_heads']
+ cfg.MODEL.num_layers = config_dict['num_hidden_layers']
+ cfg.MODEL.num_stacked_ffn = config_dict['num_feedforward_networks']
+ cfg.MODEL.pos_embed_type = 'learned'
+ cfg.MODEL.activation = config_dict['hidden_act']
+ cfg.MODEL.num_token_types = config_dict['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(config_dict['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(config_dict['attention_probs_dropout_prob'])
+ cfg.MODEL.normalization = config_dict['normalization_type']
+ cfg.MODEL.dtype = 'float32'
+
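+    # the TF config only flags which bottleneck variant was used; map it onto the explicit
+    # bottleneck_strategy names understood by the Gluon MobileBertModel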
+ if 'use_bottleneck_attention' in config_dict.keys():
+ cfg.MODEL.bottleneck_strategy = 'from_bottleneck'
+ elif 'key_query_shared_bottleneck' in config_dict.keys():
+ cfg.MODEL.bottleneck_strategy = 'qk_sharing'
+ else:
+ cfg.MODEL.bottleneck_strategy = 'from_input'
+
+ cfg.INITIALIZER.weight = ['truncnorm', 0,
+ config_dict['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ vocab_path = None
+ json_cfg_path = None
+ for ele in file_names:
+ if ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+ elif ele.endswith('.json'):
+ assert json_cfg_path is None
+ json_cfg_path = ele
+ assert vocab_path is not None and json_cfg_path is not None
+
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+    with open(vocab_path, 'r', encoding='utf-8') as f:
+        vocab_size = len(f.readlines())
+ json_cfg_path = os.path.join(tf_assets_dir, json_cfg_path)
+ cfg = convert_tf_config(json_cfg_path, vocab_size)
+ return cfg, json_cfg_path, vocab_path
+
+
+CONVERT_MAP = [
+ # mlm model
+ ('cls/', ''),
+ ('predictions/extra_output_weights', 'extra_table.weight'),
+ ('predictions/output_bias', 'embedding_table.bias'),
+ ('predictions/transform/LayerNorm', 'mlm_decoder.2'),
+ ('predictions/transform/dense', 'mlm_decoder.0'),
+ ('seq_relationship/output_bias', 'nsp_classifier.bias'),
+ ('seq_relationship/output_weights', 'nsp_classifier.weight'),
+ # backbone
+ ('bert/', 'backbone_model.'),
+ ('layer_', 'all_layers.'),
+ ('attention/output/FakeLayerNorm', 'layer_norm'),
+ ('attention/output/dense', 'attention_proj'),
+ # inner ffn layer denoted by xxx
+ ('ffn_layers_xxx/intermediate/dense', 'stacked_ffn.xxx.ffn_1'),
+ ('ffn_layers_xxx/output/FakeLayerNorm', 'stacked_ffn.xxx.layer_norm'),
+ ('ffn_layers_xxx/output/dense', 'stacked_ffn.xxx.ffn_2'),
+ # last ffn layer denoted by xxy
+ ('intermediate/dense', 'stacked_ffn.xxy.ffn_1'),
+ ('output/FakeLayerNorm', 'stacked_ffn.xxy.layer_norm'),
+ ('output/dense', 'stacked_ffn.xxy.ffn_2'),
+ # embeddings
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('embeddings/embedding_transformation', 'embed_factorized_proj'),
+ ('embeddings/FakeLayerNorm', 'embed_layer_norm'),
+ ('bottleneck/input/FakeLayerNorm', 'in_bottleneck_ln'),
+ ('bottleneck/input/dense', 'in_bottleneck_proj'),
+ ('bottleneck/attention/FakeLayerNorm', 'shared_qk_ln'),
+ ('bottleneck/attention/dense', 'shared_qk'),
+ ('output/bottleneck/FakeLayerNorm', 'out_bottleneck_ln'),
+ ('output/bottleneck/dense', 'out_bottleneck_proj'),
+ ('attention/self/key', 'attn_key'),
+ ('attention/self/query', 'attn_query'),
+ ('attention/self/value', 'attn_value'),
+ ('output/', ''),
+ ('kernel', 'weight'),
+ ('FakeLayerNorm', 'layer_norm'),
+ ('LayerNorm', 'layer_norm'),
+ ('/', '.'),
+]
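+# The replacements above are applied in order by get_name_map below, so the specific MLM,
+# bottleneck and FFN patterns must come before the generic 'kernel' -> 'weight' and
+# '/' -> '.' rewrites.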
+
+
+def get_name_map(tf_names, num_stacked_ffn):
+ """
+    Get the mapping from tensorflow parameter names to mxnet parameter names.
+    The mapping CONVERT_MAP above is tailored to MobileBERT; there is no guarantee
+    that it matches other tf models that use special variable_scopes (tensorflow)
+    or prefixes (mxnet).
+
+    Redefine the mapping when adapting this script to a customized model.
+
+    Parameters
+    ----------
+    tf_names
+        the parameter names of the tensorflow model
+    num_stacked_ffn
+        the number of stacked feedforward sub-layers in each MobileBERT layer
+
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ ffn_idx = re.findall(r'ffn_layer_\d+', target_name)
+ if ffn_idx:
+ target_name = target_name.replace(ffn_idx[0], 'ffn_layers_xxx')
+ for old, new in CONVERT_MAP:
+ target_name = target_name.replace(old, new)
+ if ffn_idx:
+ target_name = target_name.replace('stacked_ffn.xxx', 'stacked_ffn.' + ffn_idx[0][10:])
+ if 'stacked_ffn.xxy' in target_name:
+ target_name = target_name.replace(
+ 'stacked_ffn.xxy', 'stacked_ffn.' + str(num_stacked_ffn - 1))
+ name_map[source_name] = target_name
+
+ return name_map
+
+
+def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, json_cfg_path, vocab_path = convert_tf_assets(model_dir)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 3
+ seq_length = 32
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+
+ init_checkpoint = os.path.join(model_dir, 'mobilebert_variables.ckpt')
+ tf_params = read_tf_checkpoint(init_checkpoint)
+ # get parameter names for tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = list(tf_names)
+
+ sys.path.append(mobilebert_dir)
+ from mobilebert import modeling
+
+ tf_bert_config = modeling.BertConfig.from_json_file(json_cfg_path)
+ bert_model = modeling.BertModel(
+ config=tf_bert_config,
+ is_training=False,
+ input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ token_type_ids=tf_segment_ids,
+ use_one_hot_embeddings=False)
+ tvars = tf.trainable_variables()
+ assignment_map, _ = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
+ tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
+
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+        # the parameter names end with ':0', like 'bert/embeddings/word_embeddings:0'
+ backbone_params = {v.name.split(":")[0]: v.read_value() for v in tvars}
+ backbone_params = sess.run(backbone_params)
+ tf_token_outputs_np = {
+ 'pooled_output': sess.run(bert_model.get_pooled_output()),
+ 'sequence_output': sess.run(bert_model.get_sequence_output()),
+ }
+
+ # The following part only ensure the parameters in backbone model are valid
+ for k in backbone_params:
+ assert_allclose(tf_params[k], backbone_params[k])
+
+ # Build gluon model and initialize
+ gluon_pretrain_model = MobileBertForPretrain(cfg)
+ gluon_pretrain_model.initialize(ctx=ctx)
+ gluon_pretrain_model.hybridize()
+
+    # prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+ has_mlm = True
+ name_map = get_name_map(tf_names, cfg.MODEL.num_stacked_ffn)
+ # go through the gluon model to infer the shape of parameters
+ model = gluon_pretrain_model
+ contextual_embedding, pooled_output, nsp_score, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+ if has_mlm:
+ # 'embedding_table.weight' is shared with word_embed.weight
+ all_keys.remove('embedding_table.weight')
+ assert len(all_keys) == 0, 'parameters missing from tensorflow checkpoint'
+
+ # test conversion results for backbone model
+ if test_conversion:
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = model.backbone_model(
+ mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, 1E-2, 1E-2)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], 1E-2, 1E-2)
+ model.backbone_model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(model_dir, save_dir, 'model.params'))
+ model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the MLM and NSP model in {} to {}/{}'.format(model_dir,
+ save_dir, 'model_mlm.params'))
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir if args.save_dir is not None else os.path.basename(
+ args.tf_model_path) + '_gluon'
+ mobilebert_dir = os.path.abspath(
+ os.path.join(
+ os.path.dirname(
+ args.mobilebert_dir),
+ os.path.pardir))
+ convert_tf_model(args.tf_model_path, save_dir, args.test, args.gpu, mobilebert_dir)
diff --git a/scripts/conversion_toolkits/convert_mobilebert.sh b/scripts/conversion_toolkits/convert_mobilebert.sh
new file mode 100644
index 0000000000..f550ce8f3b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_mobilebert.sh
@@ -0,0 +1,9 @@
+python3 -m pip install tensorflow==1.15 --upgrade --user
+export TF_FORCE_GPU_ALLOW_GROWTH="true"
+svn checkout https://github.com/google-research/google-research/trunk/mobilebert
+
+mkdir mobilebert_model
+url='https://storage.googleapis.com/cloud-tpu-checkpoints/mobilebert/uncased_L-24_H-128_B-512_A-4_F-4_OPT.tar.gz'
+wget ${url} -O "mobilebert.tar.gz"
+tar -xvf mobilebert.tar.gz --directory mobilebert_model
+python3 convert_mobilebert.py --tf_model_path mobilebert_model/mobilebert --mobilebert_dir mobilebert --test
diff --git a/scripts/conversion_toolkits/convert_roberta.sh b/scripts/conversion_toolkits/convert_roberta.sh
new file mode 100644
index 0000000000..8bb08b0607
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_roberta.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir roberta_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/roberta.${model}.tar.gz"
+ tar zxf roberta.${model}.tar.gz --directory roberta_${model}
+ python3 convert_fairseq_roberta.py --fairseq_model_path roberta_${model}/roberta.${model} --test
+done
diff --git a/scripts/conversion_toolkits/convert_tf_hub_model.py b/scripts/conversion_toolkits/convert_tf_hub_model.py
new file mode 100644
index 0000000000..b54726e54b
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_tf_hub_model.py
@@ -0,0 +1,534 @@
+import tensorflow_hub as hub
+import tensorflow.compat.v1 as tf
+import os
+import re
+import json
+import shutil
+import logging
+import argparse
+
+import mxnet as mx
+import numpy as np
+from numpy.testing import assert_allclose
+
+from gluonnlp.data.vocab import Vocab
+from gluonnlp.utils.misc import naming_convention, logging_config
+from gluonnlp.models.bert import BertModel, BertForMLM
+from gluonnlp.models.albert import AlbertModel, AlbertForMLM
+from gluonnlp.data.tokenizers import SentencepieceTokenizer, HuggingFaceWordPieceTokenizer
+
+import tensorflow
+USE_TF_V1 = tensorflow.version.VERSION.split('.')[0] < '2'
+tf.disable_eager_execution()
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
+
+mx.npx.set_np()
+np.random.seed(1234)
+mx.npx.random.seed(1234)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Convert the TF pretrained model to Gluon')
+ parser.add_argument('--tf_hub_model_path', type=str,
+ help='Directory of the model downloaded from TF hub.')
+ parser.add_argument('--model_type', type=str, choices=['bert', 'albert'],
+ help='The name of the model to be converted. '
+ 'Only Bert and Albert are currently supported.')
+    parser.add_argument('--save_dir', type=str, default=None,
+                        help='Directory path to save the converted pretrained model.')
+    parser.add_argument('--gpu', type=int, default=None,
+                        help='A single GPU to run MXNet, e.g. 0 or 1. The default device is CPU.')
+    parser.add_argument('--test', action='store_true',
+                        help='Whether to test the conversion.')
+ args = parser.parse_args()
+ return args
+
+
+def read_tf_checkpoint(path):
+ """read tensorflow checkpoint"""
+ from tensorflow.python import pywrap_tensorflow
+ tensors = {}
+ reader = pywrap_tensorflow.NewCheckpointReader(path)
+ var_to_shape_map = reader.get_variable_to_shape_map()
+ for key in sorted(var_to_shape_map):
+ tensor = reader.get_tensor(key)
+ tensors[key] = tensor
+ return tensors
+
+
+def convert_tf_config(json_cfg_path, vocab_size, model_type):
+ """Convert the config file"""
+
+ with open(json_cfg_path, encoding='utf-8') as f:
+ json_cfg = json.load(f)
+ if model_type == 'bert':
+        # For the bert model, the config file is copied from a local configuration file,
+        # so the vocab_size cannot be checked here. It is instead verified during the
+        # conversion of the embedding weights.
+ cfg = BertModel.get_cfg().clone()
+ elif model_type == 'albert':
+ assert vocab_size == json_cfg['vocab_size']
+ cfg = AlbertModel.get_cfg().clone()
+ else:
+ raise NotImplementedError
+ cfg.defrost()
+ cfg.MODEL.vocab_size = vocab_size
+
+ cfg.MODEL.units = json_cfg['hidden_size']
+ cfg.MODEL.hidden_size = json_cfg['intermediate_size']
+ cfg.MODEL.max_length = json_cfg['max_position_embeddings']
+ cfg.MODEL.num_heads = json_cfg['num_attention_heads']
+ cfg.MODEL.num_layers = json_cfg['num_hidden_layers']
+ cfg.MODEL.pos_embed_type = 'learned'
+ if json_cfg['hidden_act'] == 'gelu':
+ cfg.MODEL.activation = 'gelu(tanh)'
+ else:
+ cfg.MODEL.activation = json_cfg['hidden_act']
+ cfg.MODEL.layer_norm_eps = 1E-12
+ cfg.MODEL.num_token_types = json_cfg['type_vocab_size']
+ cfg.MODEL.hidden_dropout_prob = float(json_cfg['hidden_dropout_prob'])
+ cfg.MODEL.attention_dropout_prob = float(json_cfg['attention_probs_dropout_prob'])
+ cfg.MODEL.dtype = 'float32'
+ cfg.INITIALIZER.weight = ['truncnorm', 0, json_cfg['initializer_range']] # TruncNorm(0, 0.02)
+ cfg.INITIALIZER.bias = ['zeros']
+ cfg.VERSION = 1
+ if model_type == 'albert':
+ # The below configurations are not supported in bert
+ cfg.MODEL.embed_size = json_cfg['embedding_size']
+ cfg.MODEL.num_groups = json_cfg['num_hidden_groups']
+ cfg.freeze()
+ return cfg
+
+
+def convert_tf_assets(tf_assets_dir, model_type):
+ """Convert the assets file including config, vocab and tokenizer model"""
+ file_names = os.listdir(tf_assets_dir)
+ json_cfg_path = None
+ spm_model_path = None
+ vocab_path = None
+ for ele in file_names:
+ if ele.endswith('.model'):
+ assert spm_model_path is None
+ spm_model_path = ele
+ elif ele.endswith('.json'):
+ assert json_cfg_path is None
+ json_cfg_path = ele
+ elif ele.endswith('.txt'):
+ assert vocab_path is None
+ vocab_path = ele
+    assert json_cfg_path is not None and \
+        (spm_model_path is not None or vocab_path is not None), \
+        "Both the json config file and at least one of the sentencepiece model " \
+        "or the vocabulary file must exist"
+
+ json_cfg_path = os.path.join(tf_assets_dir, json_cfg_path)
+ if spm_model_path:
+ spm_model_path = os.path.join(tf_assets_dir, spm_model_path)
+ tokenizer = SentencepieceTokenizer(spm_model_path)
+ vocab_size = len(tokenizer.vocab)
+ elif vocab_path:
+ vocab_path = os.path.join(tf_assets_dir, vocab_path)
+        with open(vocab_path, 'r', encoding='utf-8') as f:
+            vocab_size = len(f.readlines())
+ cfg = convert_tf_config(json_cfg_path, vocab_size, model_type)
+ return cfg, vocab_path, spm_model_path
+
+
+CONVERT_MAP_TF1 = [
+ ('bert/', 'backbone_model.'),
+ ('cls/', ''),
+ ('predictions/transform/dense', 'mlm_decoder.0'),
+ ('predictions/transform/LayerNorm', 'mlm_decoder.2'),
+ ('predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('transformer/', ''),
+ ('transform/', ''),
+ ('embeddings/word_embeddings', 'word_embed.weight'),
+ ('embeddings/token_type_embeddings', 'token_type_embed.weight'),
+ ('embeddings/position_embeddings', 'token_pos_embed._embed.weight'),
+ ('encoder/embedding_hidden_mapping_in', 'embed_factorized_proj'),
+ ('group_0/inner_group_0/', 'all_encoder_groups.0.'), # albert
+ ('layer_', 'all_layers.'), # bert
+ ('embeddings/LayerNorm', 'embed_layer_norm'),
+ ('attention/output/LayerNorm', 'layer_norm'), # bert
+ ('output/LayerNorm', 'ffn.layer_norm'), # bert
+ ('LayerNorm_1', 'ffn.layer_norm'), # albert
+ ('LayerNorm', 'layer_norm'), # albert
+ ('attention_1', 'attention'), # albert
+ ('attention/output/dense', 'attention_proj'),
+ ('ffn_1/', ''), # bert & albert
+ ('intermediate/dense', 'ffn.ffn_1'), # albert
+ ('intermediate/output/dense', 'ffn.ffn_2'), # albert
+ ('output/dense', 'ffn.ffn_2'), # bert
+ ('output/', ''),
+ ('pooler/dense', 'pooler'),
+ ('kernel', 'weight'),
+ ('attention/', ''),
+ ('/', '.'),
+]
+
+CONVERT_MAP_TF2 = [
+ (':0', ''),
+ ('cls/', ''),
+ ('predictions/output_bias', 'mlm_decoder.3.bias'),
+ ('transformer/layer_', 'encoder.all_layers.'),
+ ('word_embeddings/embeddings', 'word_embed.weight'),
+ ('type_embeddings/embeddings', 'token_type_embed.weight'),
+ ('position_embedding/embeddings', 'token_pos_embed._embed.weight'),
+ ('embeddings/layer_norm', 'embed_layer_norm'),
+ ('embedding_projection', 'embed_factorized_proj'),
+ ('self_attention/attention_output', 'attention_proj'),
+ ('self_attention_layer_norm', 'layer_norm'),
+ ('intermediate', 'ffn.ffn_1'),
+ ('output_layer_norm', 'ffn.layer_norm'),
+ ('output', 'ffn.ffn_2'),
+ ("pooler_transform", "pooler"),
+ ('kernel', 'weight'),
+ ('/', '.'),
+]
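+# TF1 Hub Modules and TF2 SavedModels name their variables differently, hence the two maps above.
+# The query/key/value kernels are skipped in get_name_map and merged into the fused attn_qkv
+# parameters later by convert_qkv_weights.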
+
+
+def get_name_map(tf_names, is_TF1=True):
+ """
+    Get the mapping from TF parameter names to MXNet parameter names.
+    The mappings CONVERT_MAP_TF1 and CONVERT_MAP_TF2 above cover Bert and Albert,
+    but there is no guarantee that they match other tf models that use special
+    variable_scopes (tensorflow) or prefixes (mxnet).
+
+    Redefine the mapping when adapting this script to a customized model.
+
+    Parameters
+    ----------
+    tf_names
+        the parameter names of the tensorflow model
+    is_TF1
+        whether the model is loaded from TF1 Hub Modules
+ whether load from TF1 Hub Modules
+
+ Returns
+ -------
+ A dictionary with the following format:
+ {tf_names : mx_names}
+ """
+ convert_map = CONVERT_MAP_TF1 if is_TF1 else CONVERT_MAP_TF2
+ name_map = {}
+ for source_name in tf_names:
+ target_name = source_name
+ # skip the qkv weights
+ if 'self/' in source_name:
+ name_map[source_name] = None
+ continue
+ if re.match(r'^transformer\/layer_[\d]+\/self_attention\/(key|value|query)\/(kernel|bias)$',
+ source_name) is not None:
+ name_map[source_name] = None
+ continue
+ for old, new in convert_map:
+ target_name = target_name.replace(old, new)
+ name_map[source_name] = target_name
+ return name_map
+
+
+def convert_tf_model(hub_model_dir, save_dir, test_conversion, model_type, gpu):
+ ctx = mx.gpu(gpu) if gpu is not None else mx.cpu()
+ # set up the model type to be converted
+ if model_type == 'bert':
+ PretrainedModel, PretrainedMLMModel = BertModel, BertForMLM
+ elif model_type == 'albert':
+ PretrainedModel, PretrainedMLMModel = AlbertModel, AlbertForMLM
+ else:
+ raise NotImplementedError
+
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ cfg, vocab_path, spm_model_path = convert_tf_assets(os.path.join(hub_model_dir, 'assets'),
+ model_type)
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(cfg.dump())
+ if spm_model_path:
+        # SentencePiece tokenizer used by the albert model
+ tokenizer = SentencepieceTokenizer(spm_model_path)
+ new_vocab = Vocab(tokenizer.vocab.all_tokens,
+                          unk_token='<unk>',
+                          pad_token='<pad>',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]')
+ shutil.copy(spm_model_path, os.path.join(save_dir, 'spm.model'))
+ elif vocab_path:
+        # WordPiece tokenizer used by the bert and electra models
+
+        # In this step the vocabulary is converted with the help of the tokenizer,
+        # so whether the tokenizer is case-dependent does not matter.
+ new_vocab = HuggingFaceWordPieceTokenizer(
+ vocab_file=vocab_path,
+ unk_token='[UNK]',
+ pad_token='[PAD]',
+ cls_token='[CLS]',
+ sep_token='[SEP]',
+ mask_token='[MASK]',
+ lowercase=True).vocab
+
+ new_vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ # test input data
+ batch_size = 2
+ seq_length = 16
+ num_mask = 5
+ input_ids = np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, seq_length))
+ valid_length = np.random.randint(seq_length // 2, seq_length, (batch_size,))
+ input_mask = np.broadcast_to(np.arange(seq_length).reshape(1, -1), (batch_size, seq_length)) \
+ < np.expand_dims(valid_length, 1)
+ segment_ids = np.random.randint(0, 2, (batch_size, seq_length))
+ mlm_positions = np.random.randint(0, seq_length // 2, (batch_size, num_mask))
+ TF1_Hub_Modules = True
+ try:
+ tf_model = hub.Module(hub_model_dir, trainable=True)
+ # see https://www.tensorflow.org/hub/tf1_hub_module for details
+ logging.info('The model is loaded as the TF1 Hub Model')
+ tf_input_ids = tf.constant(input_ids, dtype=np.int32)
+ tf_input_mask = tf.constant(input_mask, dtype=np.int32)
+ tf_segment_ids = tf.constant(segment_ids, dtype=np.int32)
+ tf_mlm_positions = tf.constant(mlm_positions, dtype=np.int32)
+ tf_mlm_outputs = tf_model(
+ dict(input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ segment_ids=tf_segment_ids,
+ mlm_positions=tf_mlm_positions), signature="mlm", as_dict=True)
+ tf_token_outputs = tf_model(
+ dict(input_ids=tf_input_ids,
+ input_mask=tf_input_mask,
+ segment_ids=tf_segment_ids), signature="tokens", as_dict=True)
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+ tf_params = sess.run(tf_model.variable_map)
+ tf_token_outputs_np = sess.run(tf_token_outputs)
+ tf_mlm_outputs_np = sess.run(tf_mlm_outputs)
+ except RuntimeError as _:
+ logging.warning('The provided model directory is not valid for TF1 Hub Modules. '
+ 'Now try to load as TF2 SavedModels')
+ bert_layer = hub.KerasLayer(hub_model_dir, trainable=True)
+ # see https://www.tensorflow.org/hub/tf2_saved_model for details
+ logging.info('The model is loaded as the TF2 SavedModel')
+ TF1_Hub_Modules = False
+ input_word_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="input_word_ids")
+ input_word_mask = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="input_mask")
+ segment_type_ids = tf.keras.layers.Input(shape=(seq_length,), dtype=tf.int32,
+ name="segment_ids")
+ pooled_output, sequence_output = bert_layer([input_word_ids, input_word_mask,
+ segment_type_ids])
+ tf_model = tf.keras.Model(
+ inputs=[input_word_ids, input_word_mask, segment_type_ids],
+ outputs=[pooled_output, sequence_output]
+ )
+ tf_params = {}
+ with tf.Session() as sess:
+ sess.run(tf.global_variables_initializer())
+ pooled_output, sequence_output = tf_model.predict([input_ids, input_mask, segment_ids])
+ tf_token_outputs_np = {'pooled_output': pooled_output,
+ 'sequence_output': sequence_output}
+            # The parameter names in the TF2 SavedModel end with ':0',
+            # like 'bert_model/word_embeddings/embeddings_2:0'
+ tf_params = {v.name.split(":")[0]: v.read_value() for v in tf_model.variables}
+ tf_params = sess.run(tf_params)
+
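+    # sanity check: the variables exposed by the TF1 Hub module should match the raw
+    # checkpoint stored alongside it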
+ if USE_TF_V1 and TF1_Hub_Modules:
+ tf_params_by_read = read_tf_checkpoint(
+ os.path.join(hub_model_dir, 'variables', 'variables'))
+ for k in tf_params:
+ assert_allclose(tf_params[k], tf_params_by_read[k])
+
+ # Get parameter names for Tensorflow with unused parameters filtered out.
+ tf_names = sorted(tf_params.keys())
+ tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
+ tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
+ tf_names = filter(lambda name: name != 'Variable', tf_names)
+ tf_names = filter(lambda name: name != 'global_step', tf_names)
+ tf_names = list(tf_names)
+
+ # Build gluon model and initialize
+ gluon_model = PretrainedModel.from_cfg(cfg, use_pooler=True)
+ gluon_model.initialize(ctx=ctx)
+ gluon_model.hybridize()
+ gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg)
+ gluon_mlm_model.initialize(ctx=ctx)
+ gluon_mlm_model.hybridize()
+
+    # Prepare test data
+ mx_input_ids = mx.np.array(input_ids, dtype=np.int32, ctx=ctx)
+ mx_valid_length = mx.np.array(valid_length, dtype=np.int32, ctx=ctx)
+ mx_token_types = mx.np.array(segment_ids, dtype=np.int32, ctx=ctx)
+ mx_masked_positions = mx.np.array(mlm_positions, dtype=np.int32, ctx=ctx)
+
+    # Start converting the 'backbone' and 'mlm' models.
+    # Some TF2 SavedModels (e.g. bert whole word masking large) do not ship MLM parameters.
+    if any(['cls' in name for name in tf_names]):
+        has_mlm = True
+    else:
+        has_mlm = False
+        logging.info('There are no masked language model parameters in this pretrained model')
+ name_map = get_name_map(tf_names, is_TF1=TF1_Hub_Modules)
+ # go through the gluon model to infer the shape of parameters
+ if has_mlm:
+ model = gluon_mlm_model
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ else:
+ model = gluon_model
+ contextual_embedding, pooled_output = model(mx_input_ids, mx_token_types,
+ mx_valid_length)
+
+ # replace tensorflow parameter names with gluon parameter names
+ mx_params = model.collect_params()
+ all_keys = set(mx_params.keys())
+ for (src_name, dst_name) in name_map.items():
+ tf_param_val = tf_params[src_name]
+ if dst_name is None:
+ continue
+ all_keys.remove(dst_name)
+ if 'self_attention/attention_output/kernel' in src_name:
+ mx_params[dst_name].set_data(tf_param_val.reshape((cfg.MODEL.units, -1)).T)
+ continue
+ if src_name.endswith('kernel'):
+ mx_params[dst_name].set_data(tf_param_val.T)
+ else:
+ mx_params[dst_name].set_data(tf_param_val)
+
+ # Merge query/kernel, key/kernel, value/kernel to encoder.all_encoder_groups.0.attn_qkv.weight
+ def convert_qkv_weights(tf_prefix, mx_prefix, is_mlm):
+ """
+        Convert and merge the query/key/value weights, whose prefixes differ across models.
+
+ In tensorflow framework, the prefix of query/key/value for the albert model is
+ 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self/query/kernel',
+ and that for the bert model is 'bert/encoder/layer_{}/attention/self/key/bias'.
+ In gluonnlp framework, the prefix is slightly different as
+ 'encoder.all_encoder_groups.0.attn_qkv.weight' for albert model and
+ 'encoder.all_layers.{}.attn_qkv.weight' for bert model, as the
+ curly braces {} can be filled with the layer number.
+ """
+ query_weight = tf_params[
+ '{}/query/kernel'.format(tf_prefix)]
+ key_weight = tf_params[
+ '{}/key/kernel'.format(tf_prefix)]
+ value_weight = tf_params[
+ '{}/value/kernel'.format(tf_prefix)]
+ query_bias = tf_params[
+ '{}/query/bias'.format(tf_prefix)]
+ key_bias = tf_params[
+ '{}/key/bias'.format(tf_prefix)]
+ value_bias = tf_params[
+ '{}/value/bias'.format(tf_prefix)]
+ if 'self_attention' in tf_prefix:
+ query_weight = query_weight.reshape((cfg.MODEL.units, -1))
+ key_weight = key_weight.reshape((cfg.MODEL.units, -1))
+ value_weight = value_weight.reshape((cfg.MODEL.units, -1))
+ query_bias = query_bias.reshape((-1,))
+ key_bias = key_bias.reshape((-1,))
+ value_bias = value_bias.reshape((-1,))
+ # Merge query_weight, key_weight, value_weight to mx_params
+ mx_weight_name = 'encoder.{}.attn_qkv.weight'.format(mx_prefix)
+ mx_bias_name = 'encoder.{}.attn_qkv.bias'.format(mx_prefix)
+ if is_mlm:
+ mx_weight_name = 'backbone_model.' + mx_weight_name
+ mx_bias_name = 'backbone_model.' + mx_bias_name
+ mx_params[mx_weight_name].set_data(
+ np.concatenate([query_weight, key_weight, value_weight], axis=1).T)
+ # Merge query_bias, key_bias, value_bias to mx_params
+ mx_params[mx_bias_name].set_data(
+ np.concatenate([query_bias, key_bias, value_bias], axis=0))
+
+ tf_prefix = None
+ if has_mlm:
+ all_keys.remove('mlm_decoder.3.weight')
+ if model_type == 'bert':
+ assert all(
+ [
+ re.match(
+ r'^(backbone_model\.){0,1}encoder\.all_layers\.[\d]+\.attn_qkv\.(weight|bias)$',
+ key) is not None for key in all_keys])
+ for layer_id in range(cfg.MODEL.num_layers):
+ mx_prefix = 'all_layers.{}'.format(layer_id)
+ if TF1_Hub_Modules:
+ tf_prefix = 'bert/encoder/layer_{}/attention/self'.format(layer_id)
+ else:
+ tf_prefix = 'transformer/layer_{}/self_attention'.format(layer_id)
+ convert_qkv_weights(tf_prefix, mx_prefix, has_mlm)
+ elif model_type == 'albert':
+ assert all(
+ [
+ re.match(
+ r'^(backbone_model\.){0,1}encoder\.all_encoder_groups\.0\.attn_qkv\.(weight|bias)$',
+ key) is not None for key in all_keys])
+ mx_prefix = 'all_encoder_groups.0'
+        assert TF1_Hub_Modules, 'Please download the albert model from TF1 Hub'
+ tf_prefix = 'bert/encoder/transformer/group_0/inner_group_0/attention_1/self'
+ convert_qkv_weights(tf_prefix, mx_prefix, has_mlm)
+ else:
+ raise NotImplementedError
+
+ tolerance = 1E-2 if cfg.MODEL.num_layers == 24 else 1E-3
+    # The pooled_output of albert large has about 0.5% mismatch under a tolerance of 1E-2,
+    # so we fall back to a looser tolerance to pass the difference check
+    tolerance = 0.2 if 'albert_large' in args.tf_hub_model_path else tolerance
+
+ def check_backbone(tested_model, tf_token_outputs_np):
+ # test conversion results for backbone model
+ tf_contextual_embedding = tf_token_outputs_np['sequence_output']
+ tf_pooled_output = tf_token_outputs_np['pooled_output']
+ contextual_embedding, pooled_output = \
+ tested_model(mx_input_ids, mx_token_types, mx_valid_length)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, tolerance, tolerance)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], tolerance, tolerance)
+
+ if not has_mlm:
+ if test_conversion:
+ check_backbone(model, tf_token_outputs_np)
+ model.save_parameters(os.path.join(save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model.params'))
+ else:
+ # test conversion results for mlm model
+ # TODO(zheyuye), figure out how to check the mlm model from TF2 SavedModel
+ if test_conversion:
+ check_backbone(model.backbone_model, tf_mlm_outputs_np)
+ if TF1_Hub_Modules:
+ tf_contextual_embedding = tf_mlm_outputs_np['sequence_output']
+ tf_pooled_output = tf_mlm_outputs_np['pooled_output']
+ tf_mlm_scores = tf_mlm_outputs_np['mlm_logits'].reshape((batch_size, num_mask, -1))
+ contextual_embedding, pooled_output, mlm_scores = \
+ model(mx_input_ids, mx_token_types, mx_valid_length, mx_masked_positions)
+ assert_allclose(pooled_output.asnumpy(), tf_pooled_output, tolerance, tolerance)
+ assert_allclose(mlm_scores.asnumpy(), tf_mlm_scores, tolerance, tolerance)
+ for i in range(batch_size):
+ ele_valid_length = valid_length[i]
+ assert_allclose(contextual_embedding[i, :ele_valid_length, :].asnumpy(),
+ tf_contextual_embedding[i, :ele_valid_length, :], tolerance, tolerance)
+ model.backbone_model.save_parameters(os.path.join(
+ save_dir, 'model.params'), deduplicate=True)
+ logging.info('Convert the backbone model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model.params'))
+ model.save_parameters(os.path.join(save_dir, 'model_mlm.params'), deduplicate=True)
+ logging.info('Convert the MLM model in {} to {}/{}'.format(hub_model_dir,
+ save_dir, 'model_mlm.params'))
+
+ # TODO(zheyuye) the gradient checking could be explored in further development
+
+ logging.info('Conversion finished!')
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+if __name__ == '__main__':
+ args = parse_args()
+ logging_config()
+ save_dir = args.save_dir \
+ if args.save_dir is not None else os.path.basename(args.tf_hub_model_path) + '_gluon'
+ convert_tf_model(args.tf_hub_model_path, save_dir, args.test, args.model_type, args.gpu)
diff --git a/scripts/conversion_toolkits/convert_xlmr.sh b/scripts/conversion_toolkits/convert_xlmr.sh
new file mode 100644
index 0000000000..20fefff7a6
--- /dev/null
+++ b/scripts/conversion_toolkits/convert_xlmr.sh
@@ -0,0 +1,8 @@
+python3 -m pip install git+https://github.com/pytorch/fairseq.git@master --upgrade --user
+for model in base large
+do
+ mkdir xlmr_${model}
+ wget "https://dl.fbaipublicfiles.com/fairseq/models/xlmr.${model}.tar.gz"
+ tar zxf xlmr.${model}.tar.gz --directory xlmr_${model}
+ python3 convert_fairseq_xlmr.py --fairseq_model_path xlmr_${model}/xlmr.${model} --model_size ${model} --test
+done
diff --git a/scripts/conversion_tools/compare_gluon_ernie.py b/scripts/conversion_tools/compare_gluon_ernie.py
deleted file mode 100644
index 6c6bd63e33..0000000000
--- a/scripts/conversion_tools/compare_gluon_ernie.py
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import gluonnlp as nlp
-import argparse
-import os
-import mxnet as mx
-import json
-
-parser = argparse.ArgumentParser(description='inference compare script for ernie model in gluon',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--input_file', type=str, default='input_cn.txt',
- help='sample input file for testing')
-parser.add_argument('--cased', action='store_true',
- help='if not set, inputs are converted to lower case')
-parser.add_argument('--gluon_dataset', type=str, default='baidu_ernie_uncased',
- help='gluon dataset name')
-parser.add_argument('--gluon_model', type=str, default='ernie_12_768_12',
- help='gluon model name')
-parser.add_argument('--gluon_parameter_file', type=str, default=None,
- help='gluon parameter file name.')
-parser.add_argument('--gluon_vocab_file', type=str, default=None,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
-
-args = parser.parse_args()
-
-input_file = os.path.expanduser(args.input_file)
-do_lower_case = not args.cased
-max_length = 11
-if not args.gluon_dataset:
- with open(args.gluon_vocab_file) as f:
- vocab_str = json.load(f)
- vocab = nlp.vocab.BERTVocab.from_json(json.dumps(vocab_str))
-else:
- vocab = None
-bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=args.gluon_dataset,
- vocab=vocab,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-if args.gluon_parameter_file:
- try:
- bert.cast('float16')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
- bert.cast('float32')
- except AssertionError:
- bert.cast('float32')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
-
-print(bert)
-tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)
-dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter('|||'))
-
-trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
-dataset = dataset.transform(trans)
-
-bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1,
- shuffle=True, last_batch='rollover')
-
-# verify the output of the first sample
-for i, seq in enumerate(bert_dataloader):
- input_ids, valid_length, type_ids = seq
- out = bert(input_ids, type_ids,
- valid_length.astype('float32'))
- length = valid_length.asscalar()
- gluon_np = out.asnumpy().squeeze(0)
- print(out)
- import numpy as np
- paddle_np = np.load(os.path.expanduser(
- 'ernie_top_layer_emb.npy'))
- np.testing.assert_array_almost_equal(paddle_np, gluon_np, decimal=6)
- break
-print("verify success")
diff --git a/scripts/conversion_tools/compare_tf_gluon_model.py b/scripts/conversion_tools/compare_tf_gluon_model.py
deleted file mode 100644
index 8895194b28..0000000000
--- a/scripts/conversion_tools/compare_tf_gluon_model.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-# pylint: disable=wrong-import-position, wrong-import-order, wildcard-import
-
-import sys
-import os
-import argparse
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-
-parser = argparse.ArgumentParser(description='Comparison script for BERT model in Tensorflow '
- 'and that in Gluon. This script works with '
- 'google/bert@f39e881b',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--input_file', type=str, default='input.txt',
- help='sample input file for testing')
-parser.add_argument('--tf_bert_repo_dir', type=str,
- default='~/bert/',
- help='path to the original Tensorflow bert repository. '
- 'The repo should be at f39e881b.')
-parser.add_argument('--tf_model_dir', type=str,
- default='~/uncased_L-12_H-768_A-12/',
- help='path to the original Tensorflow bert checkpoint directory.')
-parser.add_argument('--tf_model_prefix', type=str,
- default='bert_model.ckpt',
- help='name of bert checkpoint file.')
-parser.add_argument('--tf_config_name', type=str,
- default='bert_config.json',
- help='Name of Bert config file')
-parser.add_argument('--cased', action='store_true',
- help='if not set, inputs are converted to lower case')
-parser.add_argument('--gluon_dataset', type=str, default='book_corpus_wiki_en_uncased',
- help='gluon dataset name')
-parser.add_argument('--gluon_model', type=str, default='bert_12_768_12',
- help='gluon model name')
-parser.add_argument('--gluon_parameter_file', type=str, default=None,
- help='gluon parameter file name.')
-parser.add_argument('--gluon_vocab_file', type=str, default=None,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
-
-args = parser.parse_args()
-
-input_file = os.path.expanduser(args.input_file)
-tf_bert_repo_dir = os.path.expanduser(args.tf_bert_repo_dir)
-tf_model_dir = os.path.expanduser(args.tf_model_dir)
-vocab_file = os.path.join(tf_model_dir, 'vocab.txt')
-bert_config_file = os.path.join(tf_model_dir, args.tf_config_name)
-init_checkpoint = os.path.join(tf_model_dir, args.tf_model_prefix)
-do_lower_case = not args.cased
-max_length = 128
-
-###############################################################################
-# Tensorflow MODEL #
-###############################################################################
-# import tensorflow modules
-sys.path.insert(0, tf_bert_repo_dir)
-
-# tensorflow model inference
-import modeling
-import tokenization
-from extract_features import *
-
-# data
-num_layers = int(args.gluon_model.split('_')[1])
-layer_indexes = list(range(num_layers))
-bert_config = modeling.BertConfig.from_json_file(bert_config_file)
-tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
-examples = read_examples(input_file)
-
-features = convert_examples_to_features(
- examples=examples, seq_length=max_length, tokenizer=tokenizer)
-
-is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
-run_config = tf.contrib.tpu.RunConfig(
- master=None,
- tpu_config=tf.contrib.tpu.TPUConfig(
- num_shards=1,
- per_host_input_for_training=is_per_host))
-# model
-model_fn = model_fn_builder(
- bert_config=bert_config,
- init_checkpoint=init_checkpoint,
- layer_indexes=layer_indexes,
- use_tpu=False,
- use_one_hot_embeddings=False)
-
-estimator = tf.contrib.tpu.TPUEstimator(
- use_tpu=False,
- model_fn=model_fn,
- config=run_config,
- predict_batch_size=1)
-
-input_fn = input_fn_builder(
- features=features, seq_length=max_length)
-
-tensorflow_all_out = []
-for result in estimator.predict(input_fn, yield_single_examples=True):
- output_json = collections.OrderedDict()
- tensorflow_all_out_features = []
- all_layers = []
- for (j, layer_index) in enumerate(layer_indexes):
- layer_output = result['layer_output_%d' % j]
- layers = collections.OrderedDict()
- layers['index'] = layer_index
- layers['values'] = layer_output
- all_layers.append(layers)
- tensorflow_out_features = collections.OrderedDict()
- tensorflow_out_features['layers'] = all_layers
- tensorflow_all_out_features.append(tensorflow_out_features)
-
- output_json['features'] = tensorflow_all_out_features
- tensorflow_all_out.append(output_json)
-
-tf_outputs = [tensorflow_all_out[0]['features'][0]['layers'][t]['values'] for t in layer_indexes]
-
-###############################################################################
-# Gluon MODEL #
-###############################################################################
-
-if args.gluon_parameter_file:
- assert args.gluon_vocab_file, \
- 'Must specify --gluon_vocab_file when specifying --gluon_parameter_file'
- with open(args.gluon_vocab_file, 'r') as f:
- vocabulary = nlp.Vocab.from_json(f.read())
- bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=None,
- vocab=vocabulary,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
- try:
- bert.cast('float16')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
- bert.cast('float32')
- except AssertionError:
- bert.cast('float32')
- bert.load_parameters(args.gluon_parameter_file, ignore_extra=True)
-else:
- assert not args.gluon_vocab_file, \
- 'Cannot specify --gluon_vocab_file without specifying --gluon_parameter_file'
- bert, vocabulary = nlp.model.get_model(args.gluon_model,
- dataset_name=args.gluon_dataset,
- pretrained=not args.gluon_parameter_file,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False)
-
-print(bert)
-tokenizer = nlp.data.BERTTokenizer(vocabulary, lower=do_lower_case)
-dataset = nlp.data.TSVDataset(input_file, field_separator=nlp.data.Splitter(' ||| '))
-
-trans = nlp.data.BERTSentenceTransform(tokenizer, max_length)
-dataset = dataset.transform(trans)
-
-bert_dataloader = mx.gluon.data.DataLoader(dataset, batch_size=1,
- shuffle=True, last_batch='rollover')
-
-# verify the output of the first sample
-for i, seq in enumerate(bert_dataloader):
- input_ids, valid_length, type_ids = seq
- out = bert(input_ids, type_ids,
- valid_length.astype('float32'))
- length = valid_length.asscalar()
- a = tf_outputs[-1][:length]
- b = out[0][:length].asnumpy()
-
- print('stdev = %s' % (np.std(a - b)))
- mx.test_utils.assert_almost_equal(a, b, atol=5e-6, rtol=5e-6)
- break
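For reference, a typical invocation of the comparison script above would look roughly like the following sketch; the paths are placeholders and the flags mirror the script's own argument parser.

```bash
# Sketch only: paths are placeholders; flags and defaults come from the argparse block above.
python3 compare_tf_gluon_model.py \
    --input_file input.txt \
    --tf_bert_repo_dir ~/bert/ \
    --tf_model_dir ~/uncased_L-12_H-768_A-12/ \
    --gluon_model bert_12_768_12 \
    --gluon_dataset book_corpus_wiki_en_uncased
```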
diff --git a/scripts/conversion_tools/convert_fairseq_model.py b/scripts/conversion_tools/convert_fairseq_model.py
deleted file mode 100644
index 2dc97fcfa2..0000000000
--- a/scripts/conversion_tools/convert_fairseq_model.py
+++ /dev/null
@@ -1,213 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting Fairseq Roberta Model to Gluon. """
-import argparse
-import logging
-import os
-import sys
-import io
-import numpy as np
-
-import torch
-from fairseq.models.roberta import RobertaModel
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-from gluonnlp.data.utils import _load_pretrained_vocab
-
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for Fairseq RoBERTa model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--ckpt_dir', type=str, help='Full path to the roberta folder',
- default='/home/ubuntu/roberta/roberta.base')
-parser.add_argument('--model', type=str, help='Model type. ',
- choices=['roberta_12_768_12', 'roberta_24_1024_16'],
- default='roberta_12_768_12')
-parser.add_argument('--verbose', action='store_true', help='Verbose logging')
-
-args = parser.parse_args()
-
-ckpt_dir = os.path.expanduser(args.ckpt_dir)
-
-ckpt = torch.load(os.path.join(ckpt_dir, 'model.pt'))
-pytorch_params = ckpt['model']
-
-if args.verbose:
- print(ckpt['args'])
- for k, v in pytorch_params.items():
- print(k, v.shape)
-
-# Load the model in fairseq
-roberta = RobertaModel.from_pretrained(ckpt_dir)
-roberta.eval()
-
-def fairseq_vocab_to_gluon_vocab(torch_vocab):
- index_to_words = [None] * len(torch_vocab)
-
- bos_idx = torch_vocab.bos()
- pad_idx = torch_vocab.pad()
- eos_idx = torch_vocab.eos()
- unk_idx = torch_vocab.unk()
-
- index_to_words[bos_idx] = torch_vocab.symbols[bos_idx]
- index_to_words[pad_idx] = torch_vocab.symbols[pad_idx]
- index_to_words[eos_idx] = torch_vocab.symbols[eos_idx]
- index_to_words[unk_idx] = torch_vocab.symbols[unk_idx]
-
- specials = [bos_idx, pad_idx, eos_idx, unk_idx]
-
- openai_to_roberta = {}
- openai_vocab = _load_pretrained_vocab('openai_webtext', '.')
-
- with io.open(os.path.join(ckpt_dir, 'dict.txt'), encoding='utf-8') as f:
- for i, line in enumerate(f):
- token, count = line.split(' ')
- try:
- fake_token = int(token)
- openai_to_roberta[token] = i + len(specials)
- except ValueError:
- index_to_words[i + len(specials)] = token
-
- for idx, token in enumerate(openai_vocab.idx_to_token):
- if str(idx) in openai_to_roberta:
- index_to_words[openai_to_roberta[str(idx)]] = token
- else:
- assert token == u'', token
-
-    mask_idx = torch_vocab.index(u'<mask>')
- index_to_words[mask_idx] = torch_vocab.string([mask_idx])
- assert None not in index_to_words
- word2idx = {}
- for idx, token in enumerate(index_to_words):
- word2idx[token] = idx
-
- vocab = nlp.vocab.Vocab(word2idx, token_to_idx=word2idx,
- unknown_token=index_to_words[unk_idx],
- padding_token=index_to_words[pad_idx],
- bos_token=index_to_words[bos_idx],
- eos_token=index_to_words[eos_idx],
-                            mask_token=u'<mask>')
- return vocab
-
-vocab = fairseq_vocab_to_gluon_vocab(roberta.task.dictionary)
-
-predefined_args = bert_hparams[args.model]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'],
- layer_norm_eps=predefined_args['layer_norm_eps'])
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=False,
- use_token_type_embed=False, use_classifier=False)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, None, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-
-
-
-mapping = {
- 'decoder.2' : 'decoder.lm_head.layer_norm',
- 'decoder.0' : 'decoder.lm_head.dense',
- 'decoder.3' : 'decoder.lm_head',
- 'encoder.layer_norm' : 'decoder.sentence_encoder.emb_layer_norm',
- 'encoder.position_weight' : 'decoder.sentence_encoder.embed_positions.weight',
- 'encoder.transformer_cells': 'decoder.sentence_encoder.layers',
- 'attention_cell.proj_key.' : 'self_attn.in_proj_',
- 'attention_cell.proj_value.' : 'self_attn.in_proj_',
- 'attention_cell.proj_query.' : 'self_attn.in_proj_',
- 'ffn.ffn_1' : 'fc1',
- 'ffn.ffn_2' : 'fc2',
- 'layer_norm.gamma' : 'layer_norm.weight',
- 'layer_norm.beta' : 'layer_norm.bias',
- 'ffn.layer_norm' : 'final_layer_norm',
- 'word_embed.0.weight' : 'decoder.sentence_encoder.embed_tokens.weight',
-}
-
-for i in range(24):
- mapping['{}.layer_norm'.format(i)] = '{}.self_attn_layer_norm'.format(i)
- mapping['{}.proj'.format(i)] = '{}.self_attn.out_proj'.format(i)
-
-# set parameter data
-loaded_params = {}
-visited_pytorch_params = {}
-for name in params:
- pytorch_name = name
- for source, dest in mapping.items():
- pytorch_name = pytorch_name.replace(source, dest)
-
- assert pytorch_name in pytorch_params.keys(), 'Key ' + pytorch_name + ' for ' + name + ' not found.'
- torch_arr = pytorch_params[pytorch_name].cpu()
- # fairseq positional embedding starts with index 2
- if pytorch_name == 'decoder.sentence_encoder.embed_positions.weight':
- torch_arr = torch_arr[2:]
-
- arr = mx.nd.array(torch_arr)
- if 'attention_cell.proj' in name:
- unfused = ['query', 'key', 'value']
- arrs = arr.split(num_outputs=3, axis=0)
- for i, p in enumerate(unfused):
- if p in name:
- arr = arrs[i]
- else:
- assert arr.shape == params[name].shape, (arr.shape, params[name].shape, name, pytorch_name)
- params[name].set_data(arr)
- loaded_params[name] = True
- visited_pytorch_params[pytorch_name] = True
-
-assert len(params) == len(loaded_params)
-assert len(visited_pytorch_params) == len(pytorch_params), "Gluon model does not match PyTorch model. " \
-    "Please fix the BERTModel hyperparameters\n" + str(len(visited_pytorch_params)) + ' vs. ' + str(len(pytorch_params))
-
-
-texts = 'Hello world. abc, def and 中文!'
-torch_tokens = roberta.encode(texts)
-
-torch_features = roberta.extract_features(torch_tokens)
-pytorch_out = torch_features.detach().numpy()
-
-mx_tokenizer = nlp.data.GPT2BPETokenizer()
-mx_tokens = [vocab.bos_token] + mx_tokenizer(texts) + [vocab.eos_token]
-mx_data = vocab[mx_tokens]
-print(mx_tokens)
-print(vocab[mx_tokens])
-print(torch_tokens)
-assert mx_data == torch_tokens.tolist()
-
-mx_out = bert(mx.nd.array([mx_data]))
-print('stdev = ', np.std(mx_out.asnumpy() - pytorch_out))
-mx.test_utils.assert_almost_equal(mx_out.asnumpy(), pytorch_out, atol=1e-3, rtol=1e-3)
-mx.test_utils.assert_almost_equal(mx_out.asnumpy(), pytorch_out, atol=5e-6, rtol=5e-6)
-
-bert.save_parameters(os.path.join(ckpt_dir, args.model + '.params'))
-with io.open(os.path.join(ckpt_dir, args.model + '.vocab'), 'w', encoding='utf-8') as f:
- f.write(vocab.to_json())
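The least obvious step in the converter above is the self-attention mapping: fairseq stores the query/key/value projections as a single fused `in_proj_` matrix, while the Gluon model keeps separate `proj_query`/`proj_key`/`proj_value` parameters, so the converter slices one third of the fused array for each. A minimal standalone sketch of that split, assuming a hidden size of 768 (`roberta_12_768_12`) and using a random array purely for illustration:

```python
import numpy as np

units = 768  # hidden size of roberta_12_768_12 (illustrative)
# Fused projection as stored by fairseq: rows [0, units) -> query,
# [units, 2*units) -> key, [2*units, 3*units) -> value.
in_proj_weight = np.random.randn(3 * units, units).astype('float32')
q_w, k_w, v_w = np.split(in_proj_weight, 3, axis=0)
assert q_w.shape == k_w.shape == v_w.shape == (units, units)
```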
diff --git a/scripts/conversion_tools/convert_paddle_to_gluon.py b/scripts/conversion_tools/convert_paddle_to_gluon.py
deleted file mode 100644
index b5f71c2be9..0000000000
--- a/scripts/conversion_tools/convert_paddle_to_gluon.py
+++ /dev/null
@@ -1,254 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-import collections
-import os
-import sys
-import numpy as np
-import argparse
-import logging
-import json
-import mxnet as mx
-import gluonnlp as nlp
-import paddle.fluid as fluid
-
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-from utils import get_hash, tf_vocab_to_gluon_vocab, load_text_vocab
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--gluon_bert_model_base", default='ernie_12_768_12', type=str, help=".")
-parser.add_argument("--init_pretraining_params", default='./ERNIE_stable-1.0.1/params',
- type=str, help=".")
-parser.add_argument("--ernie_config_path", default='./ERNIE_stable-1.0.1/ernie_config.json',
- type=str, help=".")
-parser.add_argument("--ernie_vocab_path", default='./ERNIE_stable-1.0.1/vocab.txt',
- type=str, help=".")
-parser.add_argument("--out_dir", default='./ernie_gluon_model2', type=str, help=".")
-parser.add_argument("--baidu_lark_repo_dir", default='../../../../LARK', type=str,
- help='path to the original baidu lark repository. '
- 'The repo should be at f97e3c8581e36dc1979560d62f75df862acd9585.'
- '(https://github.com/PaddlePaddle/LARK.git)')
-args = parser.parse_args()
-
-sys.path = [os.path.join(args.baidu_lark_repo_dir, 'ERNIE')] + sys.path
-try:
-    from model.ernie import ErnieConfig
-    from finetune.classifier import create_model
-except ImportError:
-    raise ImportError('Please clone the ERNIE (LARK) repository first, see --baidu_lark_repo_dir')
-
-def if_exist(var):
- return os.path.exists(os.path.join(args.init_pretraining_params, var.name))
-
-
-def build_weight_map():
- weight_map = collections.OrderedDict({
- 'word_embedding': 'word_embed.0.weight',
- 'pos_embedding': 'encoder.position_weight',
- 'sent_embedding': 'token_type_embed.0.weight',
- 'pre_encoder_layer_norm_scale': 'encoder.layer_norm.gamma',
- 'pre_encoder_layer_norm_bias': 'encoder.layer_norm.beta',
- })
-
- def add_w_and_b(ernie_pre, gluon_pre):
- weight_map[ernie_pre + ".w_0"] = gluon_pre + ".weight"
- weight_map[ernie_pre + ".b_0"] = gluon_pre + ".bias"
-
- def add_one_encoder_layer(layer_number):
- # attention
- add_w_and_b("encoder_layer_{}_multi_head_att_query_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_query".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_key_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_key".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_value_fc".format(layer_number),
- "encoder.transformer_cells.{}.attention_cell.proj_value".format(layer_number))
- add_w_and_b("encoder_layer_{}_multi_head_att_output_fc".format(layer_number),
- "encoder.transformer_cells.{}.proj".format(layer_number))
- weight_map["encoder_layer_{}_post_att_layer_norm_bias".format(layer_number)] = \
- "encoder.transformer_cells.{}.layer_norm.beta".format(layer_number)
- weight_map["encoder_layer_{}_post_att_layer_norm_scale".format(layer_number)] = \
- "encoder.transformer_cells.{}.layer_norm.gamma".format(layer_number)
- # intermediate
- add_w_and_b("encoder_layer_{}_ffn_fc_0".format(layer_number),
- "encoder.transformer_cells.{}.ffn.ffn_1".format(layer_number))
- # output
- add_w_and_b("encoder_layer_{}_ffn_fc_1".format(layer_number),
- "encoder.transformer_cells.{}.ffn.ffn_2".format(layer_number))
- weight_map["encoder_layer_{}_post_ffn_layer_norm_bias".format(layer_number)] = \
- "encoder.transformer_cells.{}.ffn.layer_norm.beta".format(layer_number)
- weight_map["encoder_layer_{}_post_ffn_layer_norm_scale".format(layer_number)] = \
- "encoder.transformer_cells.{}.ffn.layer_norm.gamma".format(layer_number)
-
- for i in range(12):
- add_one_encoder_layer(i)
- add_w_and_b('pooled_fc', 'pooler')
- return weight_map
-
-
-def extract_weights(args):
- # add ERNIE to environment
- print('extract weights start'.center(60, '='))
- startup_prog = fluid.Program()
- test_prog = fluid.Program()
- place = fluid.CPUPlace()
- exe = fluid.Executor(place)
- exe.run(startup_prog)
- args.max_seq_len = 512
- args.use_fp16 = False
- args.num_labels = 2
- args.loss_scaling = 1.0
- print('model config:')
- ernie_config = ErnieConfig(args.ernie_config_path)
- ernie_config.print_config()
- with fluid.program_guard(test_prog, startup_prog):
- with fluid.unique_name.guard():
- _, _ = create_model(
- args,
- pyreader_name='train',
- ernie_config=ernie_config)
- fluid.io.load_vars(exe, args.init_pretraining_params, main_program=test_prog, predicate=if_exist)
- state_dict = collections.OrderedDict()
- weight_map = build_weight_map()
- for ernie_name, gluon_name in weight_map.items():
- fluid_tensor = fluid.global_scope().find_var(ernie_name).get_tensor()
- fluid_array = np.array(fluid_tensor, dtype=np.float32)
- if 'w_0' in ernie_name:
- fluid_array = fluid_array.transpose()
- state_dict[gluon_name] = fluid_array
- print('{} -> {} {}'.format(ernie_name, gluon_name, fluid_array.shape))
- print('extract weights done!'.center(60, '='))
- return state_dict
-
-
-def save_model(new_gluon_parameters, output_dir):
- print('save model start'.center(60, '='))
- if not os.path.exists(output_dir):
- os.makedirs(output_dir)
- # save model
- # load vocab
- vocab_f = open(os.path.join(output_dir, "vocab.txt"), "wt", encoding='utf-8')
- with open(args.ernie_vocab_path, "rt", encoding='utf-8') as f:
- for line in f:
- data = line.strip().split("\t")
- vocab_f.writelines(data[0] + "\n")
- vocab_f.close()
- vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(output_dir, "vocab.txt")))
- # vocab serialization
- tmp_file_path = os.path.expanduser(os.path.join(output_dir, 'tmp'))
- if not os.path.exists(os.path.join(args.out_dir)):
- os.makedirs(os.path.join(args.out_dir))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(output_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
- # BERT config
- tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- # 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
- }
- predefined_args = bert_hparams[args.gluon_bert_model_base]
- with open(args.ernie_config_path, 'r') as f:
- tf_config = json.load(f)
- if 'layer_norm_eps' in tf_config: # ignore layer_norm_eps
- del tf_config['layer_norm_eps']
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- if gluon_name != 'max_length':
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
- encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'],
- activation='relu')
-
- bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=True,
- use_decoder=False, use_classifier=False)
-
- bert.initialize(init=mx.init.Normal(0.02))
-
- ones = mx.nd.ones((2, 8))
- out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
- params = bert._collect_params_with_prefix()
- assert len(params) == len(new_gluon_parameters), "Gluon model does not match paddle model. " \
- "Please fix the BERTModel hyperparameters"
-
- # post processings for parameters:
- # - handle tied decoder weight
- new_gluon_parameters['decoder.3.weight'] = new_gluon_parameters['word_embed.0.weight']
- # set parameter data
- loaded_params = {}
- for name in params:
- if name == 'word_embed.0.weight':
- arr = mx.nd.array(new_gluon_parameters[name][:params[name].shape[0]])
- else:
- arr = mx.nd.array(new_gluon_parameters[name])
-        assert arr.shape == params[name].shape, \
-            'Shape mismatch for {}: {} vs {}'.format(name, arr.shape, params[name].shape)
- params[name].set_data(arr)
- loaded_params[name] = True
-
- # post processings for parameters:
- # - handle tied decoder weight
- # - update word embedding for reserved tokens
-
- if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the paddle model. '.format(
- len(params), len(loaded_params)))
-
- # param serialization
- bert.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- bert.save_parameters(gluon_param_path)
- mx.nd.waitall()
-    print('finished saving vocab and parameters')
- print('save model done!'.center(60, '='))
-
-
-if __name__ == "__main__":
- state_dict = extract_weights(args)
- save_model(state_dict, args.out_dir)
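As a rough usage sketch for the ERNIE converter above: the paths mirror the script's own defaults and are placeholders for your local layout, and the LARK repository is the one referenced in the `--baidu_lark_repo_dir` help.

```bash
# Sketch only: values mirror the script's argparse defaults; adjust paths to your checkout.
git clone https://github.com/PaddlePaddle/LARK.git ../../../../LARK
python convert_paddle_to_gluon.py \
    --init_pretraining_params ./ERNIE_stable-1.0.1/params \
    --ernie_config_path ./ERNIE_stable-1.0.1/ernie_config.json \
    --ernie_vocab_path ./ERNIE_stable-1.0.1/vocab.txt \
    --out_dir ./ernie_gluon_model2 \
    --baidu_lark_repo_dir ../../../../LARK
```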
diff --git a/scripts/conversion_tools/convert_pytorch_model.py b/scripts/conversion_tools/convert_pytorch_model.py
deleted file mode 100644
index 26f0f4a06f..0000000000
--- a/scripts/conversion_tools/convert_pytorch_model.py
+++ /dev/null
@@ -1,177 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting PyTorch Model to Gluon. """
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import gluonnlp as nlp
-import torch
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for PyTorch BERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name')
-parser.add_argument('--pytorch_checkpoint_dir', type=str,
-                    help='Path to the PyTorch checkpoint folder.')
-parser.add_argument('--vocab_file', type=str, help='Full path to the vocab.txt')
-parser.add_argument('--gluon_pytorch_name_mapping', type=str,
- default='gluon_to_pytorch_naming.json',
- help='Output of infer_pytorch_gluon_parameter_name_mapping.py')
-parser.add_argument('--out_dir', type=str, default=os.path.join('~', 'output'),
- help='Path to output folder. The folder must exist.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# convert vocabulary
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(args.vocab_file))
-
-# vocab serialization
-tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
-# Load PyTorch Model
-pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'),
- map_location=lambda storage, loc: storage)
-pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}
-
-# Make sure vocab fits to model
-assert pytorch_parameters['bert.embeddings.word_embeddings.weight'].shape[0] == len(
- vocab.idx_to_token)
-
-# Load Mapping
-with open(args.gluon_pytorch_name_mapping, 'r') as f:
- mapping = json.load(f)
-
-# BERT config
-tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
-}
-predefined_args = bert_hparams[args.model]
-with open(os.path.join(args.pytorch_checkpoint_dir, 'bert_config.json'), 'r') as f:
- tf_config = json.load(f)
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# Infer enabled BERTModel components
-use_pooler = any('pooler' in n for n in pytorch_parameters)
-use_decoder = any('cls.predictions.transform.dense.weight' in n for n in pytorch_parameters)
-use_classifier = any('cls.seq_relationship.weight' in n for n in pytorch_parameters)
-
-if not use_classifier and 'classifier.weight' in pytorch_parameters and \
- pytorch_parameters['classifier.weight'].shape[0] == 2:
- logging.info('Assuming classifier weights in provided Pytorch model are '
- 'from next sentence prediction task.')
- use_classifier = True
-
-logging.info('Inferred that the pytorch model provides the following parameters:')
-logging.info('- use_pooler = {}'.format(use_pooler))
-logging.info('- use_decoder = {}'.format(use_decoder))
-logging.info('- use_classifier = {}'.format(use_classifier))
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'], use_pooler=use_pooler,
- use_decoder=use_decoder, use_classifier=use_classifier)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-assert len(params) == len(pytorch_parameters), "Gluon model does not match PyTorch model. " \
- "Please fix the BERTModel hyperparameters"
-
-# set parameter data
-loaded_params = {}
-for name in params:
- if name not in mapping:
- raise RuntimeError('Invalid json mapping file. '
- 'The parameter {} is not described in the mapping file.'.format(name))
- pytorch_name = mapping[name]
- if pytorch_name not in pytorch_parameters.keys():
- # Handle inconsistent naming in PyTorch
-        # The expected names here are based on the PyTorch version of SciBERT;
-        # the inconsistencies were found in ClinicalBERT.
- if 'LayerNorm' in pytorch_name:
- pytorch_name = pytorch_name.replace('weight', 'gamma')
- pytorch_name = pytorch_name.replace('bias', 'beta')
- assert pytorch_name in pytorch_parameters.keys()
-
- if 'cls.seq_relationship' in pytorch_name:
- pytorch_name = pytorch_name.replace('cls.seq_relationship', 'classifier')
-
- arr = mx.nd.array(pytorch_parameters[pytorch_name])
-
- assert arr.shape == params[name].shape
- params[name].set_data(arr)
- loaded_params[name] = True
-
-if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the pytorch model. '.format(
- len(params), len(loaded_params)))
-
-# param serialization
-bert.save_parameters(tmp_file_path)
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
-logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
-bert.save_parameters(gluon_param_path)
-mx.nd.waitall()
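The converter above depends entirely on the JSON name mapping produced by infer_pytorch_gluon_parameter_name_mapping.py (further below). Once loaded, the mapping is a flat dict from Gluon parameter names to PyTorch ones; the two entries shown here are illustrative examples of the naming conventions on each side, not an exhaustive or authoritative list.

```python
# Illustrative excerpt of the kind of entries gluon_to_pytorch_naming.json contains.
mapping = {
    'word_embed.0.weight': 'bert.embeddings.word_embeddings.weight',
    'encoder.transformer_cells.0.ffn.ffn_1.weight': 'bert.encoder.layer.0.intermediate.dense.weight',
}
```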
diff --git a/scripts/conversion_tools/convert_pytorch_transformers.py b/scripts/conversion_tools/convert_pytorch_transformers.py
deleted file mode 100644
index 7dad51244c..0000000000
--- a/scripts/conversion_tools/convert_pytorch_transformers.py
+++ /dev/null
@@ -1,221 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting the distilbert model from pytorch-transformer to Gluon.
-
-Usage:
-
-pip3 install pytorch-transformers
-
-python3 convert_pytorch_transformers.py
-
-If you are not converting the distilbert model, please change the code section noted
-by "TODO".
-
- """
-
-import argparse
-import pytorch_transformers
-import torch
-import mxnet as mx
-import gluonnlp as nlp
-import os, logging, json
-from utils import get_hash, load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Conversion script for pytorch-transformers '
-                                             'DistilBERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--out_dir', type=str, help='Full path to the output folder',
- default='./converted-model')
-
-args = parser.parse_args()
-
-
-####################################################################
-# LOAD A BERT MODEL FROM PYTORCH #
-####################################################################
-# TODO: change this to your bert model and tokenizer used in pytorch-transformer
-tokenizer = pytorch_transformers.tokenization_distilbert.DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
-model = pytorch_transformers.DistilBertModel.from_pretrained('distilbert-base-uncased')
-
-dir_name = './temp'
-gluon_dir_name = args.out_dir
-nlp.utils.mkdir(dir_name)
-nlp.utils.mkdir(gluon_dir_name)
-model_name = 'bert_12_768_12'
-model.save_pretrained(dir_name)
-tokenizer.save_pretrained(dir_name)
-
-####################################################################
-# SHOW PYTORCH PARAMETER LIST #
-####################################################################
-pytorch_parameters = torch.load(os.path.join(dir_name, 'pytorch_model.bin'))
-print('parameters in pytorch')
-print(sorted(list(pytorch_parameters)))
-
-####################################################################
-# CONVERT VOCAB #
-####################################################################
-# convert vocabulary
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(os.path.join(dir_name, 'vocab.txt')))
-# vocab serialization
-tmp_file_path = os.path.expanduser(os.path.join(gluon_dir_name, 'temp'))
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.expanduser(os.path.join(gluon_dir_name, hash_short + '.vocab'))
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- print('vocab file saved to {}. hash = {}'.format(gluon_vocab_path, hash_full))
-
-####################################################################
-# CONVERT PARAMS OPTIONS #
-####################################################################
-torch_to_gluon_config_names = {
- "attention_dropout": 'dropout',
- "dim": 'embed_size',
- "dropout": 'dropout',
- "hidden_dim": 'hidden_size',
- "max_position_embeddings": 'max_length',
- "n_heads": 'num_heads',
- "n_layers": 'num_layers',
- "output_attentions": 'output_attention',
- "output_hidden_states": 'output_all_encodings',
- "vocab_size": 'vocab_size',
-}
-
-predefined_args = nlp.model.bert.bert_hparams[model_name]
-
-with open(os.path.join(dir_name, 'config.json'), 'r') as f:
- torch_config = json.load(f)
- for name, value in torch_config.items():
- if name in torch_to_gluon_config_names:
- predefined_args[torch_to_gluon_config_names[name]] = value
-
-# BERT encoder
-encoder = nlp.model.BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'], units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'], scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# BERT model
-bert = nlp.model.BERTModel(encoder, len(vocab),
- units=predefined_args['units'], embed_size=predefined_args['embed_size'],
- embed_dropout=predefined_args['embed_dropout'],
- word_embed=predefined_args['word_embed'], use_pooler=False,
- # TODO: for some models, we may need to change the value for use_token_type_embed,
- # use_classifier, and use_decoder
- use_token_type_embed=False,
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- use_classifier=False, use_decoder=False)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-print('parameters in gluon')
-print(sorted(list(params.keys())))
-assert len(params) == len(pytorch_parameters), ("Gluon model does not match PyTorch model. " \
- "Please fix the BERTModel hyperparameters", len(params), len(pytorch_parameters))
-
-####################################################################
-# CONVERT PARAMS VALUES #
-####################################################################
-mapping = {
-'encoder.layer_norm.beta': 'embeddings.LayerNorm.bias',
-'encoder.layer_norm.gamma': 'embeddings.LayerNorm.weight',
-'encoder.position_weight': 'embeddings.position_embeddings.weight',
-'word_embed.0.weight': 'embeddings.word_embeddings.weight',
-'encoder.transformer_cells': 'transformer.layer',
-'attention_cell': 'attention',
-'.proj.': '.attention.out_lin.',
-'proj_key':'k_lin',
-'proj_query':'q_lin',
-'proj_value':'v_lin',
-'ffn_1':'lin1',
-'ffn_2':'lin2',
-'ffn.layer_norm.beta':'output_layer_norm.bias',
-'ffn.layer_norm.gamma':'output_layer_norm.weight',
-}
-secondary_map = {'layer_norm.beta':'sa_layer_norm.bias',
- 'layer_norm.gamma':'sa_layer_norm.weight'
-}
-
-# set parameter data
-loaded_params = {}
-for name in params:
- pytorch_name = name
- for k, v in mapping.items():
- pytorch_name = pytorch_name.replace(k, v)
- for k, v in secondary_map.items():
- pytorch_name = pytorch_name.replace(k, v)
- arr = mx.nd.array(pytorch_parameters[pytorch_name])
- assert arr.shape == params[name].shape
- params[name].set_data(arr)
- loaded_params[name] = True
-
-if len(params) != len(loaded_params):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the pytorch model. '.format(
- len(params), len(loaded_params)))
-
-####################################################################
-# SAVE CONVERTED PARAMS #
-####################################################################
-# param serialization
-param_path = os.path.join(gluon_dir_name, 'net.params')
-bert.save_parameters(param_path)
-hash_full, hash_short = get_hash(param_path)
-print('param saved to {}. hash = {}'.format(param_path, hash_full))
-
-
-####################################################################
-# COMPARE OUTPUTS #
-####################################################################
-text = 'Hello, my dog is cute'
-# TODO: use nlp.data.GPT2Tokenizer if the GPT2 tokenizer in pytorch is used
-gluon_tokenizer = nlp.data.BERTTokenizer(vocab, lower=True)
-transform = nlp.data.BERTSentenceTransform(gluon_tokenizer, max_seq_length=512, pair=False, pad=False)
-sample = transform([text])
-words, valid_len, _ = mx.nd.array([sample[0]]), mx.nd.array([sample[1]]), mx.nd.array([sample[2]])
-# TODO: for some tokenizers, no need to truncate words
-words = words[:, 1:-1]
-seq_encoding = bert(words, None)
-print('\nconverted vocab:')
-print(vocab)
-
-print('\ntesting sample:')
-print(sample)
-print('\ngluon output: ', seq_encoding)
-
-input_ids = torch.tensor(tokenizer.encode(text)).unsqueeze(0)
-outputs = model(input_ids)
-last_hidden_states = outputs[0]
-print('\npytorch output: ')
-print(last_hidden_states)
-
-mx.nd.waitall()
-mx.test_utils.assert_almost_equal(seq_encoding.asnumpy(), last_hidden_states.detach().numpy(), atol=1e-3, rtol=1e-3)
-mx.test_utils.assert_almost_equal(seq_encoding.asnumpy(), last_hidden_states.detach().numpy(), atol=1e-5, rtol=1e-5)
-print('\nCongrats! The result is the same. Assertion passed.')
diff --git a/scripts/conversion_tools/convert_tf_model.py b/scripts/conversion_tools/convert_tf_model.py
deleted file mode 100644
index 09599dc85e..0000000000
--- a/scripts/conversion_tools/convert_tf_model.py
+++ /dev/null
@@ -1,241 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-""" Script for converting TF Model to Gluon. """
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.model import BERTEncoder, BERTModel
-from gluonnlp.model.bert import bert_hparams
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-
-from utils import (get_hash, load_text_vocab, read_tf_checkpoint,
- tf_vocab_to_gluon_vocab)
-
-
-parser = argparse.ArgumentParser(
- description='Conversion script for Tensorflow BERT model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model',
- type=str,
- default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'],
- help='BERT model name')
-parser.add_argument('--tf_checkpoint_dir',
- type=str,
- help='Path to Tensorflow checkpoint folder.')
-parser.add_argument('--tf_model_prefix', type=str,
- default='bert_model.ckpt',
- help='name of bert checkpoint file.')
-parser.add_argument('--tf_config_name', type=str,
- default='bert_config.json',
- help='Name of Bert config file')
-parser.add_argument('--out_dir',
- type=str,
- default=os.path.join('~', 'output'),
- help='Path to output folder.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# convert vocabulary
-vocab_path = os.path.join(args.tf_checkpoint_dir, 'vocab.txt')
-vocab = tf_vocab_to_gluon_vocab(load_text_vocab(vocab_path))
-
-# vocab serialization
-out_dir = os.path.expanduser(args.out_dir)
-nlp.utils.mkdir(out_dir)
-tmp_file_path = os.path.join(out_dir, 'tmp')
-with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_vocab_path = os.path.join(out_dir, hash_short + '.vocab')
-with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
-
-# load tf model
-tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
-logging.info('loading Tensorflow checkpoint %s ...', tf_checkpoint_file)
-tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-tf_names = sorted(tf_tensors.keys())
-
-tf_names = filter(lambda name: not name.endswith('adam_m'), tf_names)
-tf_names = filter(lambda name: not name.endswith('adam_v'), tf_names)
-tf_names = filter(lambda name: name != 'global_step', tf_names)
-tf_names = list(tf_names)
-if len(tf_tensors) != len(tf_names):
- logging.info('Tensorflow model was saved with Optimizer parameters. '
- 'Ignoring them.')
-
-for name in tf_names:
- logging.debug('%s: %s', name, tf_tensors[name].shape)
-
-# replace tensorflow parameter names with gluon parameter names
-NAME_MAP = [
- ('bert/encoder/layer_', 'encoder.transformer_cells.'),
- ('/attention/self/', '.attention_cell.'),
- ('key', 'proj_key'),
- ('query', 'proj_query'),
- ('value', 'proj_value'),
- ('/attention/output/LayerNorm/', '.layer_norm.'),
- ('/attention/output/dense/', '.proj.'),
- ('cls/seq_relationship/output_weights', 'classifier.weight'),
- ('cls/seq_relationship/output_bias', 'classifier.bias'),
- ('cls/predictions/output_bias', 'decoder.3.bias'),
- ('cls/predictions/transform/dense/', 'decoder.0.'),
- ('cls/predictions/transform/LayerNorm/', 'decoder.2.'),
- ('kernel', 'weight'),
- ('/intermediate/dense/', '.ffn.ffn_1.'),
- ('/output/dense/', '.ffn.ffn_2.'),
- ('/output/LayerNorm/', '.ffn.layer_norm.'),
- ('bert/embeddings/LayerNorm/', 'encoder.layer_norm.'),
- ('bert/embeddings/position_embeddings', 'encoder.position_weight'),
- ('bert/embeddings/token_type_embeddings', 'token_type_embed.0.weight'),
- ('bert/embeddings/word_embeddings', 'word_embed.0.weight'),
- ('bert/pooler/dense/', 'pooler.'),
- ('/', '.'),
-]
-
-# convert to gluon parameters
-mx_tensors = {}
-logging.info('converting to Gluon checkpoint ... ')
-for source_name in tf_names:
- # get the source tensor and its transpose
- source, source_t = tf_tensors[source_name], tf_tensors[source_name].T
- target, target_name = source, source_name
- for old, new in NAME_MAP:
- target_name = target_name.replace(old, new)
- # transpose kernel layer parameters
- if 'kernel' in source_name:
- target = source_t
- mx_tensors[target_name] = target
- if source_t.shape == source.shape and len(source.shape) > 1 and target is not source_t:
- logging.info('warning: %s has symmetric shape %s', target_name, target.shape)
- logging.debug('%s: %s', target_name, target.shape)
-
-# BERT config
-tf_config_names_to_gluon_config_names = {
- 'attention_probs_dropout_prob': 'dropout',
- 'hidden_act': None,
- 'hidden_dropout_prob': 'dropout',
- 'hidden_size': 'units',
- 'initializer_range': None,
- 'intermediate_size': 'hidden_size',
- 'max_position_embeddings': 'max_length',
- 'num_attention_heads': 'num_heads',
- 'num_hidden_layers': 'num_layers',
- 'type_vocab_size': 'token_type_vocab_size',
- 'vocab_size': None
-}
-predefined_args = bert_hparams[args.model]
-with open(os.path.join(args.tf_checkpoint_dir, args.tf_config_name), 'r') as f:
- tf_config = json.load(f)
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- for tf_name, gluon_name in tf_config_names_to_gluon_config_names.items():
- if tf_name is None or gluon_name is None:
- continue
- assert tf_config[tf_name] == predefined_args[gluon_name]
-
-# BERT encoder
-encoder = BERTEncoder(attention_cell=predefined_args['attention_cell'],
- num_layers=predefined_args['num_layers'],
- units=predefined_args['units'],
- hidden_size=predefined_args['hidden_size'],
- max_length=predefined_args['max_length'],
- num_heads=predefined_args['num_heads'],
- scaled=predefined_args['scaled'],
- dropout=predefined_args['dropout'],
- use_residual=predefined_args['use_residual'])
-
-# Infer enabled BERTModel components
-use_pooler = any('pooler' in n for n in mx_tensors)
-use_decoder = any('decoder.0' in n for n in mx_tensors)
-use_classifier = any('classifier.weight' in n for n in mx_tensors)
-
-logging.info('Inferred that the tensorflow model provides the following parameters:')
-logging.info('- use_pooler = {}'.format(use_pooler))
-logging.info('- use_decoder = {}'.format(use_decoder))
-logging.info('- use_classifier = {}'.format(use_classifier))
-
-# post processings for parameters:
-# - handle tied decoder weight
-logging.info('total number of tf parameters = %d', len(tf_names))
-if use_decoder:
- mx_tensors['decoder.3.weight'] = mx_tensors['word_embed.0.weight']
- logging.info('total number of mx parameters = %d'
- '(including decoder param for weight tying)', len(mx_tensors))
-else:
- logging.info('total number of mx parameters = %d', len(mx_tensors))
-
-# BERT model
-bert = BERTModel(encoder, len(vocab),
- token_type_vocab_size=predefined_args['token_type_vocab_size'],
- units=predefined_args['units'],
- embed_size=predefined_args['embed_size'],
- word_embed=predefined_args['word_embed'],
- use_pooler=use_pooler, use_decoder=use_decoder,
- use_classifier=use_classifier)
-
-bert.initialize(init=mx.init.Normal(0.02))
-
-ones = mx.nd.ones((2, 8))
-out = bert(ones, ones, mx.nd.array([5, 6]), mx.nd.array([[1], [2]]))
-params = bert._collect_params_with_prefix()
-if len(params) != len(mx_tensors):
- raise RuntimeError('The Gluon BERTModel comprises {} parameter arrays, '
- 'but {} have been extracted from the tf model. '
- 'Most likely the BERTModel hyperparameters do not match '
- 'the hyperparameters of the tf model.'.format(len(params), len(mx_tensors)))
-
-# set parameter data
-loaded_params = {}
-for name in params:
- try:
- arr = mx.nd.array(mx_tensors[name])
- params[name].set_data(arr)
- loaded_params[name] = True
- # pylint: disable=broad-except
- except Exception:
- if name not in mx_tensors:
- raise RuntimeError('cannot initialize %s from tf checkpoint' % name)
- else:
-            raise RuntimeError('cannot initialize %s. Expect shape = %s, but found %s' %
-                               (name, params[name].shape, arr.shape))
-
-logging.info('num loaded params = %d, total num params = %d',
- len(loaded_params), len(mx_tensors))
-for name in mx_tensors:
- if name not in loaded_params:
- logging.info('%s is not loaded', name)
-
-# param serialization
-bert.save_parameters(tmp_file_path)
-hash_full, hash_short = get_hash(tmp_file_path)
-gluon_param_path = os.path.join(out_dir, hash_short + '.params')
-logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
-bert.save_parameters(gluon_param_path)
-mx.nd.waitall()
diff --git a/scripts/conversion_tools/index.rst b/scripts/conversion_tools/index.rst
deleted file mode 100644
index aac3a4cf6c..0000000000
--- a/scripts/conversion_tools/index.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Model Conversion Tools
-----------------------
-
-:download:`Download scripts `
-
-Converting DistilBERT from PyTorch Transformer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following command downloads the DistilBERT model from pytorch-transformers
-and converts it to Gluon.
-
-.. code-block:: bash
-
- pip3 install pytorch-transformers
- python3 convert_pytorch_transformers.py --out_dir converted-model
-
-Converting RoBERTa from Fairseq
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The following command converts the `roberta checkpoint ` from fairseq to Gluon.
-The converted Gluon model is saved in the same folder as the checkpoint.
-
-.. code-block:: bash
-
- pip3 install fairseq
- # download the roberta checkpoint from the website, then do:
- python3 convert_fairseq_model.py --ckpt_dir ./roberta/roberta.base --model roberta_12_768_12
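The deleted index.rst only documented the DistilBERT and RoBERTa conversions; the TensorFlow BERT converter removed above was driven the same way. A rough invocation based on convert_tf_model.py's argument parser, with placeholder paths:

```bash
# Sketch only: the checkpoint directory is expected to contain vocab.txt,
# bert_config.json and bert_model.ckpt (see the script's defaults above).
python3 convert_tf_model.py \
    --tf_checkpoint_dir ~/uncased_L-12_H-768_A-12/ \
    --model bert_12_768_12 \
    --out_dir ~/output
```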
diff --git a/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py b/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py
deleted file mode 100644
index ea1bedd33d..0000000000
--- a/scripts/conversion_tools/infer_pytorch_gluon_parameter_name_mapping.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""PyTorch BERT parameter naming to Gluon BERT parameter naming.
-
-Given a Gluon BERT model (eg. obtained with the convert_tf_gluon.py script) and
-a pytorch_model.bin containing the same parameters, this script infers the
-naming convention of PyTorch.
-
-"""
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import gluonnlp as nlp
-import torch
-
-sys.path.insert(0, os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
-from utils import load_text_vocab, tf_vocab_to_gluon_vocab
-
-parser = argparse.ArgumentParser(description='Pytorch BERT Naming Convention',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-parser.add_argument('--model', type=str, default='bert_12_768_12',
- choices=['bert_12_768_12', 'bert_24_1024_16'], help='BERT model name')
-parser.add_argument('--dataset_name', type=str, default='scibert_scivocab_uncased',
- help='Dataset name')
-parser.add_argument('--pytorch_checkpoint_dir', type=str,
-                    help='Path to the PyTorch checkpoint folder.')
-parser.add_argument('--debug', action='store_true', help='debugging mode')
-parser.add_argument('--out', default='gluon_to_pytorch_naming.json',
- help='Output file to store gluon to pytorch name mapping.')
-args = parser.parse_args()
-logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
-logging.info(args)
-
-# Load Gluon Model
-bert, vocab = nlp.model.get_model(args.model, dataset_name=args.dataset_name, pretrained=True)
-parameters = bert._collect_params_with_prefix()
-parameters = {k: v.data().asnumpy() for k, v in parameters.items()}
-
-# Load PyTorch Model
-pytorch_parameters = torch.load(os.path.join(args.pytorch_checkpoint_dir, 'pytorch_model.bin'),
- map_location=lambda storage, loc: storage)
-pytorch_vocab = tf_vocab_to_gluon_vocab(
- load_text_vocab(os.path.join(args.pytorch_checkpoint_dir, 'vocab.txt')))
-pytorch_parameters = {k: v.numpy() for k, v in pytorch_parameters.items()}
-
-# Assert that vocabularies are equal
-assert pytorch_vocab.idx_to_token == vocab.idx_to_token
-
-mapping = dict()
-
-for name, param in parameters.items():
- found_match = False
- for pytorch_name, pytorch_param in pytorch_parameters.items():
- if param.shape == pytorch_param.shape:
- if (param == pytorch_param).all():
- if found_match:
- print('Found multiple matches for {}. '
- 'Ignoring new match {}'.format(name, pytorch_name))
- else:
- found_match = True
- mapping.update({name: pytorch_name})
-
-    # We don't break here, in case there are multiple matches
-
- if not found_match:
- raise RuntimeError('Pytorch and Gluon model do not match. '
- 'Cannot infer mapping of names.')
-
-assert len(mapping) == len(parameters)
-
-with open(args.out, 'w') as f:
- json.dump(mapping, f, indent=" ")
- print('Wrote mapping to {}'.format(args.out))
diff --git a/scripts/conversion_tools/input.txt b/scripts/conversion_tools/input.txt
deleted file mode 100644
index d1e3f410d0..0000000000
--- a/scripts/conversion_tools/input.txt
+++ /dev/null
@@ -1 +0,0 @@
-Who was Jim Henson ? ||| Jim Henson was a puppeteer
diff --git a/scripts/conversion_tools/input_cn.txt b/scripts/conversion_tools/input_cn.txt
deleted file mode 100644
index d1f598b9c0..0000000000
--- a/scripts/conversion_tools/input_cn.txt
+++ /dev/null
@@ -1 +0,0 @@
-这是百度的ERNIE模型 |||
diff --git a/scripts/conversion_tools/utils.py b/scripts/conversion_tools/utils.py
deleted file mode 100644
index a056ceb834..0000000000
--- a/scripts/conversion_tools/utils.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions for BERT."""
-
-import logging
-import collections
-import hashlib
-import io
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['tf_vocab_to_gluon_vocab', 'load_text_vocab']
-
-
-def tf_vocab_to_gluon_vocab(tf_vocab):
- special_tokens = ['[UNK]', '[PAD]', '[SEP]', '[MASK]', '[CLS]']
- assert all(t in tf_vocab for t in special_tokens)
- counter = nlp.data.count_tokens(tf_vocab.keys())
- vocab = nlp.vocab.BERTVocab(counter, token_to_idx=tf_vocab)
- return vocab
-
-
-def get_hash(filename):
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-def load_text_vocab(vocab_file):
- """Loads a vocabulary file into a dictionary."""
- vocab = collections.OrderedDict()
- index = 0
- with io.open(vocab_file, 'r') as reader:
- while True:
- token = reader.readline()
- if not token:
- break
- token = token.strip()
- vocab[token] = index
- index += 1
- return vocab
diff --git a/scripts/datasets/README.md b/scripts/datasets/README.md
new file mode 100644
index 0000000000..50cd555495
--- /dev/null
+++ b/scripts/datasets/README.md
@@ -0,0 +1,57 @@
+# Datasets
+
+This page describes how to download and prepare the datasets used in GluonNLP.
+
+Essentially, we provide scripts for downloading and preparing the datasets.
+The directory structure and the format of the processed datasets are well documented so that you are able to
+reuse the scripts with your own data (as long as the structure/format matches).
+
+Thus, the typical workflow for running experiments is:
+
+- Download and prepare data with the scripts in [datasets](.).
+In case you need to further preprocess the dataset, there are toolkits in [preprocess](../preprocess).
+- Run the experiments in [scripts](..).
+
+
+## Available Datasets
+- [Machine Translation](./machine_translation)
+ - [WMT](./machine_translation/README.md#wmt)
+- [Question Answering](./question_answering)
+ - [SQuAD](./question_answering/README.md#squad)
+ - [SearchQA](./question_answering/README.md#searchqa)
+ - [TriviaQA](./question_answering/README.md#triviaqa)
+ - [HotpotQA](./question_answering/README.md#hotpotqa)
+- [Language Modeling](./language_modeling)
+ - [WikiText-2](./language_modeling)
+ - [WikiText-103](./language_modeling)
+ - [Text8](./language_modeling)
+ - [Enwiki8](./language_modeling)
+ - [Google Billion Words](./language_modeling)
+- [Music Generation](./music_generation)
+ - [LakhMIDI](./music_generation/README.md#lakh-midi)
+ - [MAESTRO](./music_generation/README.md#maestro)
+- [Pretraining Corpus](./pretrain_corpus)
+ - [Wikipedia](./pretrain_corpus/README.md#wikipedia)
+ - [BookCorpus](./pretrain_corpus/README.md#bookcorpus)
+ - [OpenWebText](./pretrain_corpus/README.md#openwebtext)
+- [General NLP Benchmarks](./general_nlp_benchmark)
+ - [GLUE](./general_nlp_benchmark/README.md#glue-benchmark)
+ - [SuperGLUE](./general_nlp_benchmark/README.md#superglue-benchmark)
+
+## Contribution Guide
+
+We are very happy to receive and merge your contributions of new datasets :smiley:.
+
+To add a new dataset, create a `prepare_{DATASET_NAME}.py` file in the appropriate folder.
+Also, remember to document in the `README.md` 1) the directory structure and 2) how to use the CLI tool for downloading + preprocessing.
+In addition, add citations in `prepare_{DATASET_NAME}.py` to credit the original authors.
+Refer to the existing scripts or ask questions on GitHub if you need help.
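+
+A new script should follow the registration pattern used by the existing `prepare_*.py` files so that it becomes available through the `nlp_data` CLI. Below is a minimal sketch of that pattern; `prepare_foo` and its flags are hypothetical placeholders, and the real download/checksum/processing logic goes inside `main`:
+
+```python
+# prepare_foo.py -- a sketch of the registration pattern used by the existing
+# prepare_*.py scripts. "prepare_foo" and its flags are placeholders.
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+
+@DATA_PARSER_REGISTRY.register('prepare_foo')
+def get_parser():
+    parser = argparse.ArgumentParser(description='Download and prepare the foo dataset.')
+    parser.add_argument('--save-dir', type=str, default='foo',
+                        help='Directory to store the prepared data.')
+    return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_foo')
+def main(args):
+    # Download the raw files, verify their SHA1 checksums, and write the
+    # processed output to args.save_dir.
+    pass
+
+
+def cli_main():
+    parser = get_parser()
+    args = parser.parse_args()
+    main(args)
+
+
+if __name__ == '__main__':
+    cli_main()
+```
+
+Note that the dispatcher in `__main__.py` only sees the scripts it imports, so the new module also needs to be imported there.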
+
+Every URL is paired with a SHA1 checksum so that corrupted downloads can be detected. See the files in [url_checksums](./url_checksums) for examples.
+
+To generate the hash values of the data files, revise [update_download_stats.py](update_download_stats.py)
+to include the new URLs and create the stats file that stores the checksums. Then run the following command to update them:
+
+```bash
+python3 update_download_stats.py
+```
diff --git a/scripts/datasets/__init__.py b/scripts/datasets/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/__main__.py b/scripts/datasets/__main__.py
new file mode 100644
index 0000000000..301c7036e9
--- /dev/null
+++ b/scripts/datasets/__main__.py
@@ -0,0 +1,41 @@
+import argparse
+from .machine_translation import prepare_wmt
+from .question_answering import prepare_squad, prepare_hotpotqa, prepare_searchqa, prepare_triviaqa
+from .language_modeling import prepare_lm
+from .music_generation import prepare_music_midi
+from .pretrain_corpus import prepare_bookcorpus, prepare_wikipedia, prepare_openwebtext
+from .general_nlp_benchmark import prepare_glue
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+# TODO(zheyuye): lazily import these data parser functions, data main functions,
+# and their dependencies via a dictionary mapping the dataset names to the functions.
+def list_all_subcommands():
+ out = []
+ for key in DATA_PARSER_REGISTRY.list_keys():
+ if key not in DATA_MAIN_REGISTRY._obj_map:
+ raise KeyError('The data cli "{}" is registered in parser but is missing'
+ ' in main'.format(key))
+ out.append(key)
+ return out
+
+
+def cli_main():
+ parser = argparse.ArgumentParser(
+        description='Built-in scripts for downloading and preparing the data in GluonNLP.',
+ prog='nlp_data', add_help=False)
+ parser.add_argument('command', type=str,
+ choices=list_all_subcommands() + ['help'],
+ metavar='[subcommand]',
+ help='The subcommand to use. '
+ 'Choices are {}.'.format(list_all_subcommands() + ['help']))
+ args, other_args = parser.parse_known_args()
+ if args.command == 'help':
+ parser.print_help()
+ else:
+ parser = DATA_PARSER_REGISTRY.create(args.command)
+ sub_args = parser.parse_args(other_args)
+ DATA_MAIN_REGISTRY.create(args.command, sub_args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/general_nlp_benchmark/README.md b/scripts/datasets/general_nlp_benchmark/README.md
new file mode 100644
index 0000000000..ff902750a1
--- /dev/null
+++ b/scripts/datasets/general_nlp_benchmark/README.md
@@ -0,0 +1,104 @@
+# Language Understanding Benchmarks
+
+This page documents how to download and prepare the
+[GLUE](https://gluebenchmark.com/) and [SuperGLUE](https://super.gluebenchmark.com/) benchmarks.
+
+These benchmarks share the common goal of providing a robust set of downstream tasks for evaluating
+the performance of NLP models.
+
+In essence, these NLP tasks share a similar structure. We are interested in the question:
+can we design a model that solves all of these tasks at once?
+[BERT](https://arxiv.org/pdf/1810.04805.pdf) did a good job of unifying the way we
+featurize the text data, extracting two types of embeddings: one for the
+whole sentence and one for each token in the sentence. Later,
+in [T5](https://arxiv.org/pdf/1910.10683.pdf), the authors proposed to convert every task
+into a text-to-text problem. However, it is difficult to convert tasks like sentence similarity
+or named-entity recognition into a text-to-text format, because they involve real values or text
+spans that are difficult to encode as raw text.
+
+In GluonNLP, we propose a unified way to tackle these NLP problems. We convert these datasets
+into tables. Each column in the table is either 1) raw text, 2) an entity or a list of entities associated with the
+raw text, or 3) a numerical value or a list of numerical values.
+In addition, we keep a metadata object that describes 1) the relationship among columns and
+2) certain properties of the columns.
+
+All tasks used in these general benchmarks are converted to this format.
+
+
+## GLUE Benchmark
+
+The details of the benchmark are described in [GLUE Paper](https://openreview.net/pdf?id=rJ4km2R5t7).
+
+To obtain the dataset, run:
+
+```
+nlp_data prepare_glue --benchmark glue
+```
+
+There will be one folder per task. All data are converted into pandas dataframes (stored as Parquet
+files) plus an additional `metadata.json` object where needed.
+
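+For instance, after the command above finishes, each split can be loaded with pandas. Below is a minimal
+sketch, assuming the default output directory `glue` and the STS-B task (stored under `glue/sts`):
+
+```python
+import json
+import os
+import pandas as pd
+
+# Paths assume `nlp_data prepare_glue --benchmark glue` with the default --data_dir.
+train_df = pd.read_parquet('glue/sts/train.parquet')
+print(train_df.columns)  # sentence1, sentence2, genre, score
+
+# metadata.json is only written for tasks whose columns reference spans in
+# other columns (e.g. WiC/WSC/ReCoRD in SuperGLUE); STS-B does not have one.
+meta_path = 'glue/sts/metadata.json'
+if os.path.exists(meta_path):
+    with open(meta_path) as f:
+        print(json.load(f))
+```
+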
+Here are the details of the datasets:
+
+| Dataset | #Train | #Dev | #Test | Columns | Task | Metrics | Domain |
+|---------|--------|------|--------|---------------------|------------------------------|------------------------------|---------------------|
+| CoLA | 8.5k | 1k | 1k | sentence, **label** | acceptability (0 / 1) | Matthews corr. | misc. |
+| SST-2 | 67k | 872 | 1.8k | sentence, **label** | sentiment | acc. | movie reviews |
+| MRPC | 3.7k | 408 | 1.7k | sentence1, sentence2, **label** | paraphrase | acc./F1 | news |
+| STS-B | 5.7k | 1.5k | 1.4k | sentence1, sentence2, **score** | sentence similarity | Pearson/Spearman corr. | misc. |
+| QQP | 364k | 40k | 391k | sentence1, sentence2, **label** | paraphrase | acc./F1 | social QA questions |
+| MNLI | 393k | 9.8k(m) / 9.8k(mm) | 9.8k(m) / 9.8k(mm) | sentence1, sentence2, genre, **label** | NLI | matched acc./mismatched acc. | misc |
+| QNLI | 105k | 5.4k | 5.4k | question, sentence, **label** | QA/NLI | acc. | Wikipedia |
+| RTE | 2.5k | 227 | 3k | sentence1, sentence2, **label** | NLI | acc. | news, Wikipedia |
+| WNLI | 634 | 71 | 146 | sentence1, sentence2, **label** | NLI | acc. | fiction books |
+
+In addition, GLUE has a diagnostic task that analyzes a system's performance on a broad range of linguistic phenomena.
+It is best described in [GLUE Diagnostic](https://gluebenchmark.com/diagnostics).
+The diagnostic dataset is based on Natural Language Inference (NLI), and you will need to evaluate the model trained on
+MNLI on this dataset.
+
+| Dataset | #Sample | Data Format | Metrics |
+|-------------|---------|-------------|-----------------|
+| Diagnostic | 1104 | semantics, predicate, logic, knowledge, domain, premise, hypothesis, label | Matthews corr. |
+
+In addition, we provide the SNLI dataset, which is recommended as an auxiliary data source when training MNLI.
+This is the approach recommended in [GLUE](https://openreview.net/pdf?id=rJ4km2R5t7).
+
+| Dataset | #Train | #Test | Data Format | Task | Metrics | Domain |
+|---------|---------|--------|-----------------------------|------|---------|--------|
+| SNLI | 549K | 20k | sentence1, sentence2, **label** | NLI | acc. | misc |
+
+
+## SuperGLUE Benchmark
+
+The details are described in [SuperGLUE Paper](https://arxiv.org/pdf/1905.00537.pdf).
+
+To obtain the benchmark, run:
+
+```
+nlp_data prepare_glue --benchmark superglue
+```
+
+
+| Dataset | #Train | #Dev | #Test | Columns | Task | Metrics | Domain |
+|----------|---------|------|---------|---------------------|--------------|------------------------------|---------------------------------|
+| BoolQ | 9.4k | 3.3k | 3.2k | passage, question, **label** | QA | acc. | Google queries, Wikipedia |
+| CB | 250 | 57 | 250 | premise, hypothesis, **label** | NLI | acc./F1 | various |
+| COPA | 400 | 100 | 500 | premise, choice1, choice2, question, **label** | QA | acc. | blogs, photography encyclopedia |
+| MultiRC* | 5.1k (27k) | 953 (4.8k) | 1.8k (9.7k) | passage, question, answer, **label** | QA | F1/EM | various |
+| ReCoRD | 101k | 10k | 10k | source, text, entities, query, **answers** | QA | F1/EM | news |
+| RTE | 2.5k | 278 | 3k | premise, hypothesis, **label** | NLI | acc. | news, Wikipedia |
+| WiC | 6k | 638 | 1.4k | sentence1, sentence2, entities1, entities2, **label** | WSD | acc. | WordNet, VerbNet, Wiktionary |
+| WSC | 554 | 104 | 146 | text, entities, **label** | coref. | acc. | fiction books |
+
+*Note that for MultiRC, we enumerate all (passage, question, answer) triplets in
+the dataset; the number of samples in the expanded format is given in parentheses.
+
+Similar to GLUE, SuperGLUE has two diagnostic tasks to analyze the system performance
+on a broad range of linguistic phenomena. For more details,
+see [SuperGLUE Diagnostic](https://super.gluebenchmark.com/diagnostics).
+
+| Dataset | #Samples | Columns | Metrics |
+|---------------|----------|----------------------|----------------|
+| Winogender | 356 |hypothesis, premise, label | Accuracy |
+| Broadcoverage | 1104 | label, sentence1, sentence2, logic | Matthews corr. |
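+
+For the tasks whose columns reference spans inside another column (e.g., WiC, WSC, ReCoRD), the
+relationship is recorded in the accompanying `metadata.json`. As a rough illustration, the WiC
+metadata written by the preparation script has the following shape (see `read_wic` in
+[prepare_glue.py](prepare_glue.py)):
+
+```python
+# Entity columns store {'start': ..., 'end': ...} span offsets into the
+# parent column named in 'attrs'.
+wic_metadata = {
+    'entities1': {'type': 'entity', 'attrs': {'parent': 'sentence1'}},
+    'entities2': {'type': 'entity', 'attrs': {'parent': 'sentence2'}},
+}
+```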
diff --git a/scripts/datasets/general_nlp_benchmark/__init__.py b/scripts/datasets/general_nlp_benchmark/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/general_nlp_benchmark/prepare_glue.py b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
new file mode 100644
index 0000000000..bbaf01cf48
--- /dev/null
+++ b/scripts/datasets/general_nlp_benchmark/prepare_glue.py
@@ -0,0 +1,707 @@
+# Disclaimer! The script here is partially based on
+# https://github.com/nyu-mll/jiant/blob/master/scripts/download_glue_data.py
+# and
+# https://github.com/nyu-mll/jiant/blob/master/scripts/download_superglue_data.py
+import os
+import sys
+import shutil
+import tempfile
+import argparse
+import zipfile
+import json
+import pathlib
+import pandas as pd
+import pyarrow
+import pyarrow.json
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.data.tokenizers import WhitespaceTokenizer
+
+
+_CITATIONS = """
+@inproceedings{wang2019glue,
+ title={GLUE: A multi-task benchmark and analysis platform for natural language understanding},
+ author={Wang, Alex and Singh, Amanpreet and Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel R},
+ booktitle={ICLR},
+ year={2019}
+}
+
+@inproceedings{wang2019superglue,
+ title={Superglue: A stickier benchmark for general-purpose language understanding systems},
+ author={Wang, Alex and Pruksachatkun, Yada and Nangia, Nikita and Singh, Amanpreet and
+ Michael, Julian and Hill, Felix and Levy, Omer and Bowman, Samuel},
+ booktitle={Advances in Neural Information Processing Systems},
+ pages={3261--3275},
+ year={2019}
+}
+"""
+
+GLUE_TASKS = ["cola", "sst", "mrpc", "qqp", "sts", "mnli",
+ "snli", "qnli", "rte", "wnli", "diagnostic"]
+SUPERGLUE_TASKS = ["cb", "copa", "multirc", "rte", "wic", "wsc", "boolq", "record",
+ 'broadcoverage-diagnostic', 'winogender-diagnostic']
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS = load_checksum_stats(os.path.join(
+ _CURR_DIR, '..', 'url_checksums', 'glue.txt'))
+_URL_FILE_STATS.update(load_checksum_stats(os.path.join(
+ _CURR_DIR, '..', 'url_checksums', 'superglue.txt')))
+
+
+def read_tsv_glue(tsv_file, num_skip=1, keep_column_names=False):
+ out = []
+ nrows = None
+ if keep_column_names:
+ assert num_skip == 1
+ column_names = None
+ with open(tsv_file, 'r') as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if i < num_skip:
+ if keep_column_names:
+ column_names = line.split()
+ continue
+ elements = line.split('\t')
+ out.append(elements)
+ if nrows is None:
+ nrows = len(elements)
+ else:
+ assert nrows == len(elements)
+ df = pd.DataFrame(out, columns=column_names)
+ series_l = []
+ for col_name in df.columns:
+ idx = df[col_name].first_valid_index()
+ val = df[col_name][idx]
+ if isinstance(val, str):
+ try:
+ dat = pd.to_numeric(df[col_name])
+ series_l.append(dat)
+ continue
+ except ValueError:
+ pass
+ series_l.append(df[col_name])
+ new_df = pd.DataFrame({name: series for name, series in zip(df.columns, series_l)})
+ return new_df
+
+
+def read_jsonl_superglue(jsonl_file):
+ columns = None
+ out = []
+ with open(jsonl_file, 'r') as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ sample = json.loads(line)
+ if columns is None:
+ columns = list(sample.keys())
+ else:
+ assert sorted(columns) == sorted(list(sample.keys())),\
+ 'Columns={}, sample.keys()={}'.format(columns, sample.keys())
+ out.append([sample[col] for col in columns])
+ df = pd.DataFrame(out, columns=columns)
+ return df
+
+
+# Classification will be stored as pandas dataframe
+def read_cola(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ if fold == 'test':
+ df = pd.read_csv(csv_file, '\t')
+ df = df[['sentence']]
+ df_dict[fold] = df
+ else:
+ df = pd.read_csv(csv_file, '\t', header=None)
+ df = df[[3, 1]]
+ df.columns = ['sentence', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_sst(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['sentence']]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_mrpc(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ tsv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(tsv_file)
+ if fold == 'test':
+ df = df[[3, 4]]
+ df.columns = ['sentence1', 'sentence2']
+ else:
+ df = df[[3, 4, 0]]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_qqp(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['question1', 'question2']]
+ df.columns = ['sentence1', 'sentence2']
+ else:
+ df = df[['question1', 'question2', 'is_duplicate']]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_sts(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file)
+ if fold == 'test':
+ df = df[[7, 8, 1]]
+ df.columns = ['sentence1', 'sentence2', 'genre']
+ else:
+ df = df[[7, 8, 1, 9]]
+ df.columns = ['sentence1', 'sentence2', 'genre', 'score']
+ genre_l = []
+ for ele in df['genre'].tolist():
+ if ele == 'main-forum':
+ genre_l.append('main-forums')
+ else:
+ genre_l.append(ele)
+ df['genre'] = pd.Series(genre_l)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_mnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev_matched', 'dev_mismatched', 'test_matched', 'test_mismatched']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file, 1, True)
+ if 'test' in fold:
+ df = df[['sentence1', 'sentence2', 'genre']]
+ else:
+ df = df[['sentence1', 'sentence2', 'genre', 'gold_label']]
+ df.columns = ['sentence1', 'sentence2', 'genre', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_snli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ column_names = None
+ out = []
+ with open(csv_file) as f:
+ for i, line in enumerate(f):
+ line = line.strip()
+ if i == 0:
+ column_names = line.split()
+ column_names = column_names[:10] + [column_names[-1]]
+ continue
+ elements = line.split('\t')
+ first_few_elements = elements[:10]
+ gold_label = elements[-1]
+ out.append(first_few_elements + [gold_label])
+ df = pd.DataFrame(out, columns=column_names)
+ df = df[['sentence1', 'sentence2', 'gold_label']]
+ df.columns = ['sentence1', 'sentence2', 'label']
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_qnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = read_tsv_glue(csv_file, 1, True)
+ if fold == 'test':
+ df_dict[fold] = df[['question', 'sentence']]
+ else:
+ df_dict[fold] = df[['question', 'sentence', 'label']]
+ return df_dict, None
+
+
+def read_rte(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df_dict[fold] = df[['sentence1', 'sentence2']]
+ else:
+ df_dict[fold] = df[['sentence1', 'sentence2', 'label']]
+ return df_dict, None
+
+
+def read_wnli(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'dev', 'test']:
+ csv_file = os.path.join(dir_path, '{}.tsv'.format(fold))
+ df = pd.read_csv(csv_file, '\t')
+ if fold == 'test':
+ df = df[['sentence1', 'sentence2']]
+ else:
+ df = df[['sentence1', 'sentence2', 'label']]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+# The GLUE diagnostic set is evaluated with the model trained on MNLI
+def read_glue_diagnostic(dir_path):
+ csv_file = os.path.join(dir_path, 'diagnostic-full.tsv')
+ df = pd.read_csv(csv_file, '\t')
+ df.columns = ['semantics', 'predicate', 'logic', 'knowledge', 'domain', 'premise',
+ 'hypothesis', 'label']
+ return df
+
+
+def read_cb(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['premise', 'hypothesis']
+ if fold != 'test':
+ columns.append('label')
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_copa(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['premise', 'choice1', 'choice2', 'question']
+ if fold != 'test':
+ columns.append('label')
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+# passage, question, answer, passage_idx, question_idx, answer_idx
+def read_multirc(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ columns = ['passage', 'question', 'answer', 'psg_idx', 'qst_idx', 'ans_idx']
+ if fold != 'test':
+ columns.append('label')
+ out = []
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ with open(jsonl_path, 'r') as f:
+ for line in f:
+                sample = json.loads(line.strip())
+                psg_idx = sample['idx']
+ passage = sample['passage']['text']
+ for qa in sample['passage']['questions']:
+ qst_idx = qa['idx']
+ question = qa['question']
+ for ans in qa['answers']:
+ ans_idx = ans['idx']
+ answer = ans['text']
+ if fold == 'test':
+ out.append((passage, question, answer, psg_idx, qst_idx, ans_idx))
+ else:
+ label = ans['label']
+ out.append((passage, question, answer, psg_idx, qst_idx,
+ ans_idx, label))
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_rte_superglue(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ if fold == 'test':
+ columns = ['premise', 'hypothesis']
+ else:
+ columns = ['premise', 'hypothesis', 'label']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df = df[columns]
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_wic(dir_path):
+ df_dict = dict()
+ meta_data = dict()
+ meta_data['entities1'] = {'type': 'entity', 'attrs': {'parent': 'sentence1'}}
+ meta_data['entities2'] = {'type': 'entity', 'attrs': {'parent': 'sentence2'}}
+
+ for fold in ['train', 'val', 'test']:
+ if fold != 'test':
+ columns = ['sentence1', 'sentence2', 'entities1', 'entities2', 'label']
+ else:
+ columns = ['sentence1', 'sentence2', 'entities1', 'entities2']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ out = []
+ for idx, row in df.iterrows():
+ sentence1 = row['sentence1']
+ sentence2 = row['sentence2']
+ start1 = row['start1']
+ end1 = row['end1']
+ start2 = row['start2']
+ end2 = row['end2']
+ if fold == 'test':
+ out.append([sentence1, sentence2,
+ {'start': start1, 'end': end1},
+ {'start': start2, 'end': end2}])
+ else:
+ label = row['label']
+ out.append([sentence1, sentence2,
+ {'start': start1, 'end': end1},
+ {'start': start2, 'end': end2},
+ label])
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_wsc(dir_path):
+ df_dict = dict()
+ tokenizer = WhitespaceTokenizer()
+ meta_data = dict()
+ meta_data['noun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ meta_data['pronoun'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ for fold in ['train', 'val', 'test']:
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ samples = []
+ for i in range(len(df)):
+ text = df.loc[i, 'text']
+ if fold != 'test':
+ label = df.loc[i, 'label']
+ target = df.loc[i, 'target']
+ span1_index = target['span1_index']
+ span2_index = target['span2_index']
+ span1_text = target['span1_text']
+ span2_text = target['span2_text']
+            # Build the entity spans as {'start': ..., 'end': ...} character
+            # offsets into the text, e.g. {'start': 0, 'end': 100}
+ tokens, offsets = tokenizer.encode_with_offsets(text, str)
+ pos_start1 = offsets[span1_index][0]
+ pos_end1 = pos_start1 + len(span1_text)
+ pos_start2 = offsets[span2_index][0]
+ pos_end2 = pos_start2 + len(span2_text)
+ if fold == 'test':
+ samples.append({'text': text,
+ 'noun': {'start': pos_start1, 'end': pos_end1},
+ 'pronoun': {'start': pos_start2, 'end': pos_end2}})
+ else:
+ samples.append({'text': text,
+ 'noun': {'start': pos_start1, 'end': pos_end1},
+ 'pronoun': {'start': pos_start2, 'end': pos_end2},
+ 'label': label})
+ df = pd.DataFrame(samples)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_boolq(dir_path):
+ df_dict = dict()
+ for fold in ['train', 'val', 'test']:
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df_dict[fold] = df
+ return df_dict, None
+
+
+def read_record(dir_path):
+ df_dict = dict()
+ meta_data = dict()
+ meta_data['entities'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ meta_data['answers'] = {'type': 'entity', 'attrs': {'parent': 'text'}}
+ for fold in ['train', 'val', 'test']:
+ if fold != 'test':
+ columns = ['source', 'text', 'entities', 'query', 'answers']
+ else:
+ columns = ['source', 'text', 'entities', 'query']
+ jsonl_path = os.path.join(dir_path, '{}.jsonl'.format(fold))
+ df = read_jsonl_superglue(jsonl_path)
+ df_dict[fold] = df
+ out = []
+ for i, row in df.iterrows():
+ source = row['source']
+ passage = row['passage']
+ text = passage['text']
+ entities = passage['entities']
+ entities = [{'start': ele['start'], 'end': ele['end']} for ele in entities]
+ for qas in row['qas']:
+ query = qas['query']
+ if fold != 'test':
+ answer_entities = qas['answers']
+ out.append((source, text, entities, query, answer_entities))
+ else:
+ out.append((source, text, entities, query))
+ df = pd.DataFrame(out, columns=columns)
+ df_dict[fold] = df
+ return df_dict, meta_data
+
+
+def read_winogender_diagnostic(dir_path):
+ jsonl_path = os.path.join(dir_path, 'AX-g.jsonl')
+ df = read_jsonl_superglue(jsonl_path)
+ return df
+
+
+def read_broadcoverage_diagnostic(dir_path):
+ df = pyarrow.json.read_json(os.path.join(dir_path, 'AX-b.jsonl')).to_pandas()
+ return df
+
+
+GLUE_TASK2PATH = {
+ "cola": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4", # noqa
+ "sst": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8", # noqa
+ "mrpc": {
+ 'train': "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt",
+ 'dev': "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc",
+ 'test': "https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt"
+ },
+ "qqp": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277", # noqa
+ "sts": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5", # noqa
+ "mnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce", # noqa
+ "snli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df", # noqa
+ "qnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601", # noqa
+ "rte": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb", # noqa
+ "wnli": "https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf", # noqa
+ "diagnostic": [
+ "https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D", # noqa
+ "https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1",
+ ],
+}
+
+GLUE_READERS = {
+ 'cola': read_cola,
+ 'sst': read_sst,
+ 'mrpc': read_mrpc,
+ 'qqp': read_qqp,
+ 'sts': read_sts,
+ 'mnli': read_mnli,
+ 'snli': read_snli,
+ 'qnli': read_qnli,
+ 'rte': read_rte,
+ 'wnli': read_wnli,
+ 'diagnostic': read_glue_diagnostic
+}
+
+
+SUPERGLUE_TASK2PATH = {
+ "cb": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip",
+ "copa": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip",
+ "multirc": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip",
+ "rte": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip",
+ "wic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip",
+ "wsc": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip",
+ "broadcoverage-diagnostic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip",
+ "winogender-diagnostic": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip",
+ "boolq": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip",
+ "record": "https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip",
+}
+
+SUPERGLUE_READER = {
+ 'cb': read_cb,
+ 'copa': read_copa,
+ 'multirc': read_multirc,
+ 'rte': read_rte_superglue,
+ 'wic': read_wic,
+ 'wsc': read_wsc,
+ 'boolq': read_boolq,
+ 'record': read_record,
+ 'broadcoverage-diagnostic': read_broadcoverage_diagnostic,
+ 'winogender-diagnostic': read_winogender_diagnostic
+}
+
+
+def format_mrpc(data_dir):
+ mrpc_dir = os.path.join(data_dir, "mrpc")
+ os.makedirs(mrpc_dir, exist_ok=True)
+ mrpc_train_file = os.path.join(mrpc_dir, "msr_paraphrase_train.txt")
+ mrpc_test_file = os.path.join(mrpc_dir, "msr_paraphrase_test.txt")
+ download(GLUE_TASK2PATH["mrpc"]['train'], mrpc_train_file,
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['train']])
+ download(GLUE_TASK2PATH["mrpc"]['test'], mrpc_test_file,
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['test']])
+ assert os.path.isfile(mrpc_train_file), "Train data not found at %s" % mrpc_train_file
+ assert os.path.isfile(mrpc_test_file), "Test data not found at %s" % mrpc_test_file
+ download(GLUE_TASK2PATH["mrpc"]['dev'],
+ os.path.join(mrpc_dir, "dev_ids.tsv"),
+ sha1_hash=_URL_FILE_STATS[GLUE_TASK2PATH["mrpc"]['dev']])
+
+ dev_ids = []
+ with open(os.path.join(mrpc_dir, "dev_ids.tsv"), encoding="utf8") as ids_fh:
+ for row in ids_fh:
+ dev_ids.append(row.strip().split("\t"))
+
+ with open(mrpc_train_file, encoding="utf8") as data_fh, open(
+ os.path.join(mrpc_dir, "train.tsv"), "w", encoding="utf8"
+ ) as train_fh, open(os.path.join(mrpc_dir, "dev.tsv"), "w", encoding="utf8") as dev_fh:
+ header = data_fh.readline()
+ train_fh.write(header)
+ dev_fh.write(header)
+ for row in data_fh:
+ label, id1, id2, s1, s2 = row.strip().split("\t")
+ if [id1, id2] in dev_ids:
+ dev_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+ else:
+ train_fh.write("%s\t%s\t%s\t%s\t%s\n" % (label, id1, id2, s1, s2))
+
+ with open(mrpc_test_file, encoding="utf8") as data_fh, open(
+ os.path.join(mrpc_dir, "test.tsv"), "w", encoding="utf8"
+ ) as test_fh:
+ header = data_fh.readline()
+ test_fh.write("index\t#1 ID\t#2 ID\t#1 String\t#2 String\n")
+ for idx, row in enumerate(data_fh):
+ label, id1, id2, s1, s2 = row.strip().split("\t")
+ test_fh.write("%d\t%s\t%s\t%s\t%s\n" % (idx, id1, id2, s1, s2))
+
+
+def get_tasks(benchmark, task_names):
+ task_names = task_names.split(",")
+ ALL_TASKS = GLUE_TASKS if benchmark == 'glue' else SUPERGLUE_TASKS
+ if "all" in task_names:
+ tasks = ALL_TASKS
+ else:
+ tasks = []
+ for task_name in task_names:
+ if task_name != 'diagnostic':
+ assert task_name in ALL_TASKS, "Task %s not found!" % task_name
+ tasks.append(task_name)
+ if "RTE" in tasks and "diagnostic" not in tasks:
+ tasks.append("diagnostic")
+ has_diagnostic = any(['diagnostic' in task for task in tasks])
+ if has_diagnostic:
+ tasks = [ele for ele in tasks if 'diagnostic' not in ele]
+ tasks.append('diagnostic')
+ return tasks
+
+
+@DATA_PARSER_REGISTRY.register('prepare_glue')
+def get_parser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--benchmark", choices=['glue', 'superglue'],
+ default='glue', type=str)
+ parser.add_argument("-d", "--data_dir", help="directory to save data to", type=str,
+ default=None)
+ parser.add_argument(
+ "-t",
+ "--tasks",
+ help="tasks to download data for as a comma separated string",
+ type=str,
+ default="all"
+ )
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'glue'),
+ help='The temporary path to download the dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_glue')
+def main(args):
+ if args.data_dir is None:
+ args.data_dir = args.benchmark
+ args.cache_path = os.path.join(args.cache_path, args.benchmark)
+ print('Downloading {} to {}. Selected tasks = {}'.format(args.benchmark,
+ args.data_dir, args.tasks))
+ os.makedirs(args.cache_path, exist_ok=True)
+ os.makedirs(args.data_dir, exist_ok=True)
+ tasks = get_tasks(args.benchmark, args.tasks)
+ if args.benchmark == 'glue':
+ TASK2PATH = GLUE_TASK2PATH
+ TASK2READER = GLUE_READERS
+ elif args.benchmark == 'superglue':
+ TASK2PATH = SUPERGLUE_TASK2PATH
+ TASK2READER = SUPERGLUE_READER
+ else:
+ raise NotImplementedError
+ for task in tasks:
+ print('Processing {}...'.format(task))
+        if 'diagnostic' in task:
+ if args.benchmark == 'glue':
+ reader = TASK2READER[task]
+ base_dir = os.path.join(args.data_dir, 'rte_diagnostic')
+ os.makedirs(base_dir, exist_ok=True)
+ download(TASK2PATH['diagnostic'][0],
+ path=os.path.join(base_dir, 'diagnostic.tsv'),
+ sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][0]])
+ download(TASK2PATH['diagnostic'][1],
+ path=os.path.join(base_dir, 'diagnostic-full.tsv'),
+ sha1_hash=_URL_FILE_STATS[TASK2PATH['diagnostic'][1]])
+ df = reader(base_dir)
+ df.to_parquet(os.path.join(base_dir, 'diagnostic-full.parquet'))
+ else:
+ for key, name in [('broadcoverage-diagnostic', 'AX-b'),
+ ('winogender-diagnostic', 'AX-g')]:
+ data_file = os.path.join(args.cache_path, "{}.zip".format(key))
+ url = TASK2PATH[key]
+ reader = TASK2READER[key]
+ download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
+ with zipfile.ZipFile(data_file) as zipdata:
+ zipdata.extractall(args.data_dir)
+ df = reader(os.path.join(args.data_dir, name))
+ df.to_parquet(os.path.join(args.data_dir, name, '{}.parquet'.format(name)))
+ elif task == 'mrpc':
+ reader = TASK2READER[task]
+ format_mrpc(args.data_dir)
+ df_dict, meta_data = reader(os.path.join(args.data_dir, 'mrpc'))
+ for key, df in df_dict.items():
+ if key == 'val':
+ key = 'dev'
+ df.to_parquet(os.path.join(args.data_dir, 'mrpc', '{}.parquet'.format(key)))
+ with open(os.path.join(args.data_dir, 'mrpc', 'metadata.json'), 'w') as f:
+ json.dump(meta_data, f)
+ else:
+ # Download data
+ data_file = os.path.join(args.cache_path, "{}.zip".format(task))
+ url = TASK2PATH[task]
+ reader = TASK2READER[task]
+ download(url, data_file, sha1_hash=_URL_FILE_STATS[url])
+ base_dir = os.path.join(args.data_dir, task)
+ if os.path.exists(base_dir):
+ print('Found!')
+ continue
+ zip_dir_name = None
+ with zipfile.ZipFile(data_file) as zipdata:
+ if zip_dir_name is None:
+ zip_dir_name = os.path.dirname(zipdata.infolist()[0].filename)
+ zipdata.extractall(args.data_dir)
+ shutil.move(os.path.join(args.data_dir, zip_dir_name),
+ base_dir)
+ df_dict, meta_data = reader(base_dir)
+ for key, df in df_dict.items():
+ if key == 'val':
+ key = 'dev'
+ df.to_parquet(os.path.join(base_dir, '{}.parquet'.format(key)))
+ if meta_data is not None:
+ with open(os.path.join(base_dir, 'metadata.json'), 'w') as f:
+ json.dump(meta_data, f)
+ print("\tCompleted!")
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ cli_main()
diff --git a/scripts/datasets/language_modeling/README.md b/scripts/datasets/language_modeling/README.md
new file mode 100644
index 0000000000..a75779ea42
--- /dev/null
+++ b/scripts/datasets/language_modeling/README.md
@@ -0,0 +1,24 @@
+# Language Modeling Benchmark
+
+Prepare the language modeling benchmark datasets.
+To help reproduce the results in the papers, we use
+the tokenized corpora as the training/validation/test sets.
+
+```bash
+# WikiText-2
+nlp_data prepare_lm --dataset wikitext2
+
+# WikiText-103
+nlp_data prepare_lm --dataset wikitext103
+
+# enwik8
+nlp_data prepare_lm --dataset enwik8
+
+# Text8
+nlp_data prepare_lm --dataset text8
+
+# Google One-Billion-Word
+nlp_data prepare_lm --dataset gbw
+```
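+
+After preparation, each folder contains `train.txt`, `valid.txt`, and `test.txt` together with a
+`vocab.json` saved by the script (the character-level corpora additionally keep `*.raw` files, and
+GBW is stored as sharded `train/` and `test/` folders). Below is a minimal sketch of inspecting the
+prepared WikiText-2 training split, assuming the default save directory `wikitext2`:
+
+```python
+from collections import Counter
+
+# Assumes `nlp_data prepare_lm --dataset wikitext2` was run with the default save dir.
+counter = Counter()
+with open('wikitext2/train.txt', encoding='utf-8') as f:
+    for line in f:
+        counter.update(line.split())
+print('tokens={}, unique tokens={}'.format(sum(counter.values()), len(counter)))
+```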
+
+Happy language modeling :)
diff --git a/scripts/datasets/language_modeling/__init__.py b/scripts/datasets/language_modeling/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/language_modeling/prepare_lm.py b/scripts/datasets/language_modeling/prepare_lm.py
new file mode 100644
index 0000000000..6f56ddb02d
--- /dev/null
+++ b/scripts/datasets/language_modeling/prepare_lm.py
@@ -0,0 +1,265 @@
+import argparse
+import os
+import zipfile
+import tarfile
+import shutil
+from typing import List, Optional
+from collections import Counter
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.data.vocab import Vocab
+
+
+_CITATIONS = """
+@ONLINE {mahoney2011large,
+ title={Large text compression benchmark},
+ author={Mahoney, Matt},
+ url={http://www.mattmahoney.net/dc/text.html},
+ year={2011}
+}
+
+@article{chelba2013one,
+ title={One billion word benchmark for measuring progress in statistical language modeling},
+ author={Chelba, Ciprian and Mikolov, Tomas and Schuster, Mike and Ge, Qi and Brants, Thorsten
+ and Koehn, Phillipp and Robinson, Tony},
+ journal={arXiv preprint arXiv:1312.3005},
+ year={2013}
+}
+
+
+@inproceedings{merity2016pointer,
+ title={Pointer sentinel mixture models},
+ author={Merity, Stephen and Xiong, Caiming and Bradbury, James and Socher, Richard},
+ booktitle={ICLR},
+ year={2017}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums',
+ 'language_model.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_URLS = {
+ 'wikitext2': 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip',
+ 'wikitext103': 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip',
+ 'enwik8': 'http://mattmahoney.net/dc/enwik8.zip',
+ 'text8': 'http://mattmahoney.net/dc/text8.zip',
+ # The original address of Google One Billion Word dataset is
+ # http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz
+    # We uploaded the file to S3 to speed up the download.
+ 'gbw': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz',
+ # The data is obtained from https://raw.githubusercontent.com/rafaljozefowicz/lm/master/1b_word_vocab.txt
+ 'gbw_vocab': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_lm')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading and Preprocessing'
+ ' Language Modeling Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+ choices=['wikitext2', 'wikitext103', 'text8', 'enwik8', 'gbw'],
+ help='The dataset to use.')
+ parser.add_argument('--save-dir', type=str, default=None,
+ help='The directory to save the dataset.'
+ ' By default, it will save to a folder with the same name as the '
+ 'dataset')
+ parser.add_argument('--overwrite', action='store_true', help='Whether to overwrite the saved '
+ 'files.')
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'lm_benchmark_data'),
+ help='The temporary path to download the dataset.')
+ return parser
+
+
+def path_exist_and_skip(path, overwrite):
+ if os.path.exists(path) and not overwrite:
+        print('Found {}. Skipping writing. Pass `--overwrite` to force updating the file.'
+              .format(path))
+ return True
+ return False
+
+
+def build_vocab(corpus_path_l: List, eos_token: Optional[str] = '<eos>') -> Vocab:
+ """Build the default vocabulary used in datasets like
+
+ - wikitext2
+ - wikitext103
+ - text8
+    - enwik8
+
+    The strategy is to split on whitespace and store all tokens that appear.
+    The tokens are sorted in descending order of frequency.
+
+ Parameters
+ ----------
+ corpus_path_l
+        The list of corpus paths.
+ eos_token
+ If it is not None, the eos_token will be added to the vocabulary.
+
+ Returns
+ -------
+ vocab
+ The vocabulary
+ """
+ counter = Counter()
+ ntokens = 0
+ print('Build the default vocabulary used in benchmarks:')
+ for corpus_path in corpus_path_l:
+ with open(corpus_path, 'r', encoding='utf-8') as f:
+ for idx, line in enumerate(f):
+ if idx > 0 and idx % 500000 == 0:
+ print(' line {}'.format(idx))
+ line = line.strip()
+ tokens = line.split()
+ counter.update(tokens)
+ ntokens += len(tokens)
+ if eos_token is not None and eos_token in counter:
+ raise ValueError('eos_token is set to be "{}", which appears in the text. '
+ 'Is it intended? You may choose another token as the eos_token.'
+ .format(eos_token))
+ vocab = Vocab(counter, unk_token=None, eos_token=eos_token)
+ print('Processed {} tokens, vocab={}'.format(ntokens, vocab))
+ return vocab
+
+
+@DATA_MAIN_REGISTRY.register('prepare_lm')
+def main(args):
+ # Download the data
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ save_dir = args.dataset if args.save_dir is None else args.save_dir
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir, exist_ok=True)
+ # Extract and process the data
+ if args.dataset == 'wikitext2':
+ with zipfile.ZipFile(target_download_location) as zf:
+ train_data = zf.read('wikitext-2/wiki.train.tokens')
+ valid_data = zf.read('wikitext-2/wiki.valid.tokens')
+ test_data = zf.read('wikitext-2/wiki.test.tokens')
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt'),
+ os.path.join(save_dir, 'valid.txt'),
+ os.path.join(save_dir, 'test.txt')])
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'wikitext103':
+ with zipfile.ZipFile(target_download_location) as zf:
+ train_data = zf.read('wikitext-103/wiki.train.tokens')
+ valid_data = zf.read('wikitext-103/wiki.valid.tokens')
+ test_data = zf.read('wikitext-103/wiki.test.tokens')
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ if not path_exist_and_skip(filename, args.overwrite):
+ print('{} will have {} bytes'.format(filename, len(part)))
+ with open(filename, 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')])
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'text8':
+ with zipfile.ZipFile(target_download_location) as zf:
+ with zf.open('text8', 'r') as f:
+ data = f.read().decode('utf-8')
+ num_test_chars = 5000000
+ train_data = data[: -2 * num_test_chars]
+ valid_data = data[-2 * num_test_chars: -num_test_chars]
+ test_data = data[-num_test_chars:]
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ print('- Tokenizing...')
+ # Change space ' ' to underscore '_'
+ part_str = ' '.join(['_' if c == ' ' else c for c in part.strip()])
+ print('- Writing...')
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'w', encoding='utf-8') as of:
+ of.write(part_str)
+ if not path_exist_and_skip(filename + '.raw', args.overwrite):
+ with open(filename + '.raw', 'w', encoding='utf-8') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')], eos_token=None)
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ elif args.dataset == 'enwik8':
+ with zipfile.ZipFile(target_download_location) as zf:
+ data = zf.read('enwik8')
+ print('Length of enwik8: {}'.format(len(data)))
+ num_test_chars = 5000000
+ train_data = data[: -2 * num_test_chars]
+ valid_data = data[-2 * num_test_chars: -num_test_chars]
+ test_data = data[-num_test_chars:]
+
+ for filename, part in [('train.txt', train_data),
+ ('valid.txt', valid_data),
+ ('test.txt', test_data)]:
+ filename = os.path.join(save_dir, filename)
+ print('{} will have {} bytes'.format(filename, len(part)))
+ print('- Tokenizing...')
+ part_str = ' '.join([str(c) if c != ord('\n') else '\n' for c in part])
+ print('- Writing...')
+ if not path_exist_and_skip(filename, args.overwrite):
+ with open(filename, 'w') as of:
+ of.write(part_str)
+ if not path_exist_and_skip(filename + '.raw', args.overwrite):
+ with open(filename + '.raw', 'wb') as of:
+ of.write(part)
+ vocab = build_vocab([os.path.join(save_dir, 'train.txt')], eos_token=None)
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+
+ elif args.dataset == 'gbw':
+ vocab_path = download(_URLS['gbw_vocab'],
+ os.path.join(args.cache_path, '1b_word_vocab.txt'),
+ sha1_hash=_URL_FILE_STATS[_URLS['gbw_vocab']])
+ with tarfile.open(target_download_location) as f:
+ os.makedirs(os.path.join(save_dir, 'train'), exist_ok=True)
+ os.makedirs(os.path.join(save_dir, 'test'), exist_ok=True)
+ for member in f.getmembers():
+ if 'training-monolingual.tokenized.shuffled' in member.name \
+ and 'news.en' in member.name:
+ basename = os.path.basename(member.name)
+ with f.extractfile(member) as f_in:
+ with open(os.path.join(save_dir, 'train', basename), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ elif 'heldout-monolingual.tokenized.shuffled' in member.name and \
+ '.heldout-' in member.name:
+ basename = os.path.basename(member.name)
+ with f.extractfile(member) as f_in:
+ with open(os.path.join(save_dir, 'test', basename), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ all_tokens = []
+ with open(vocab_path, 'r') as f:
+ for token in f:
+ token = token.strip().split()[0]
+ all_tokens.append(token)
+        vocab = Vocab(all_tokens, bos_token='<S>', unk_token='<UNK>')
+ vocab.save(os.path.join(save_dir, 'vocab.json'))
+ print('Saved Google-One-Billion-Word in {}'.format(save_dir))
+ print('Vocab={}'.format(vocab))
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/machine_translation/README.md b/scripts/datasets/machine_translation/README.md
new file mode 100644
index 0000000000..e9b2f7c194
--- /dev/null
+++ b/scripts/datasets/machine_translation/README.md
@@ -0,0 +1,89 @@
+# Machine Translation
+
+In machine translation, we train a model to map a sentence from the source language, e.g., English,
+to the target language, e.g., Chinese. Here, we provide scripts to download the common benchmark
+datasets for machine translation. The downloaded datasets are stored as a pair of corpus files,
+one for the source and the other for the target.
+
+## WMT
+You can use [prepare_wmt.py](prepare_wmt.py) to download and prepare the raw training corpus and
+then use [clean_parallel_corpus.py](../../preprocess/clean_parallel_corpus.py) to clean and
+filter the corpus.
+
+You may download the raw WMT2014 en-de dataset with:
+```bash
+nlp_data prepare_wmt \
+ --dataset wmt2014 \
+ --lang-pair en-de \
+ --save-path wmt2014_en_de
+```
+
+By combining `nlp_data` and `nlp_preprocess`, we provide an example of preparing the
+WMT2014 en-de training dataset: [wmt2014_ende.sh](wmt2014_ende.sh). This involves three steps:
+- Downloading the raw text data
+- Cleaning and tokenizing the data
+- Learning a subword model and applying it to the data
+
+```bash
+bash wmt2014_ende.sh yttm
+```
+
+We support the following subword learning algorithms:
+
+```bash
+# BPE from YouTokenToMe
+bash wmt2014_ende.sh yttm
+
+# BPE from Huggingface
+bash wmt2014_ende.sh hf_bpe
+
+# BPE from subword-nmt
+bash wmt2014_ende.sh subword_nmt
+
+# Byte-level BPE
+bash wmt2014_ende.sh hf_bytebpe
+
+# Sentencepiece
+bash wmt2014_ende.sh spm
+
+# WordPiece
+bash wmt2014_ende.sh hf_wordpiece
+```
+
+
+Apart from WMT2014 EN-DE, we also provide a script for preparing the training data for the
+WMT2017 ZH-EN task:
+[wmt2017_zhen.sh](wmt2017_zhen.sh).
+
+### Monolingual Corpus
+In the WMT competition, there are additional monolingual corpora that help you train NMT models.
+You may download a raw monolingual corpus by adding the `--mono` flag.
+
+One example is to download the newscrawl monolingual corpus in German:
+
+```bash
+nlp_data prepare_wmt \
+ --mono \
+ --mono_lang de \
+ --dataset newscrawl \
+ --save-path wmt2014_mono
+```
+
+
+### Directory Structure of Translation Dataset
+
+The basic structure of a translation dataset is as follows:
+```
+folder_name
+├── train.raw.{src}
+├── train.raw.{tgt}
+├── train.tok.{src}
+├── train.tok.{tgt}
+├── train.tok.{subword_model}.{src}
+├── train.tok.{subword_model}.{tgt}
+├── ...
+├── ... Repeat for valid and test
+├── ...
+├── {subword_model}.model
+├── {subword_model}.path
+```
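+
+Since the corpora are stored as line-aligned parallel files, a source sentence and its translation
+share the same line number. Below is a minimal sketch of iterating over a few tokenized pairs; the
+folder name and file suffixes are placeholders for whatever your run produced (e.g. `train.tok.en`
+and `train.tok.de`):
+
+```python
+import itertools
+
+src_path = 'wmt2014_en_de/train.tok.en'  # placeholder paths for illustration
+tgt_path = 'wmt2014_en_de/train.tok.de'
+with open(src_path, encoding='utf-8') as f_src, open(tgt_path, encoding='utf-8') as f_tgt:
+    for src_line, tgt_line in itertools.islice(zip(f_src, f_tgt), 3):
+        print(src_line.strip(), '|||', tgt_line.strip())
+```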
diff --git a/scripts/datasets/machine_translation/__init__.py b/scripts/datasets/machine_translation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/machine_translation/prepare_wmt.py b/scripts/datasets/machine_translation/prepare_wmt.py
new file mode 100644
index 0000000000..2ac5f77772
--- /dev/null
+++ b/scripts/datasets/machine_translation/prepare_wmt.py
@@ -0,0 +1,1071 @@
+from typing import List, Union, IO, AnyStr, Tuple, Optional
+import re
+import os
+import argparse
+import zipfile
+import shutil
+import functools
+import tarfile
+import gzip
+import json
+from xml.etree import ElementTree
+from gluonnlp.data.filtering import ProfanityFilter
+from gluonnlp.utils.misc import file_line_number, download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir, get_repo_url
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+# The datasets are provided by WMT2014-WMT2019 and can be freely used for research purposes.
+# You will need to cite the WMT14-WMT19 shared task overview papers and follow any additional
+# citation requirements for the specific individual datasets; see
+# http://www.statmt.org/wmt14/translation-task.html to
+# http://www.statmt.org/wmt19/translation-task.html
+
+
+_CITATIONS = """
+@inproceedings{ziemski2016united,
+ title={The united nations parallel corpus v1. 0},
+ author={Ziemski, Micha{\l} and Junczys-Dowmunt, Marcin and Pouliquen, Bruno},
+ booktitle={Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC'16)},
+ pages={3530--3534},
+ year={2016}
+}
+
+@inproceedings{barrault2019findings,
+ title={Findings of the 2019 conference on machine translation (wmt19)},
+ author={Barrault, Lo{\"\i}c and Bojar, Ond{\v{r}}ej and Costa-juss{\`a}, Marta R and Federmann, Christian and Fishel, Mark and Graham, Yvette and Haddow, Barry and Huck, Matthias and Koehn, Philipp and Malmasi, Shervin and others},
+ booktitle={Proceedings of the Fourth Conference on Machine Translation (Volume 2: Shared Task Papers, Day 1)},
+ pages={1--61},
+ year={2019}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'wmt')
+_URL_FILE_STATS = load_checksum_stats(os.path.join(_CURR_DIR, '..', 'url_checksums', 'wmt.txt'))
+
+
+# Here, we make sure that the languages follow the standard ISO 639-1 language tags.
+# For more information about the language tags, you may refer to
+# https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
+_PARA_URLS = {
+ 'europarl': {
+ 'v7': {
+ 'cs-en': {
+ 'url': 'http://www.statmt.org/europarl/v7/cs-en.tgz',
+ 'cs': 'europarl-v7.cs-en.cs',
+ 'en': 'europarl-v7.cs-en.en',
+ },
+ 'de-en': {
+ 'url': 'http://www.statmt.org/europarl/v7/de-en.tgz',
+ 'de': 'europarl-v7.de-en.de',
+ 'en': 'europarl-v7.de-en.en',
+ }
+ },
+ 'v8': {
+ 'url': 'http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz',
+ 'fi-en': {
+ 'fi': 'training/europarl-v8.fi-en.fi',
+ 'en': 'training/europarl-v8.fi-en.en'
+ },
+ 'et-en': {
+ 'et': 'training/europarl-v8.et-en.et',
+ 'en': 'training/europarl-v8.et-en.en'
+ }
+ },
+ 'v9': {
+ 'cs-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz',
+ 'all': 'europarl-v9.cs-en.tsv'
+ },
+ 'de-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz',
+ 'all': 'europarl-v9.de-en.tsv'
+ },
+ 'fi-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz',
+ 'all': 'europarl-v9.fi-en.tsv'
+ },
+ 'lt-en': {
+ 'url': 'http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz',
+ 'all': 'europarl-v9.lt-en.tsv'
+ }
+ }
+ },
+ 'paracrawl': {
+ 'r3': {
+ 'en-cs': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-cs.bicleaner07.tmx.gz',
+ 'all': 'en-cs.bicleaner07.tmx'
+ },
+ 'en-de': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-de.bicleaner07.tmx.gz',
+ 'all': 'en-de.bicleaner07.tmx'
+ },
+ 'en-fi': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-fi.bicleaner07.tmx.gz',
+ 'all': 'en-fi.bicleaner07.tmx'
+ },
+ 'en-lt': {
+ 'url': 'https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz',
+ 'all': 'en-lt.bicleaner07.tmx'
+ }
+ }
+ },
+ 'commoncrawl': {
+ 'wmt13': {
+ 'url': 'http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz',
+ 'de-en': {
+ 'de': 'commoncrawl.de-en.de',
+ 'en': 'commoncrawl.de-en.en',
+ }
+ }
+ },
+ 'newscommentary': {
+ 'v9': {
+ 'url': 'http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz',
+ 'de-en': {
+ 'de': 'training/news-commentary-v9.de-en.de',
+ 'en': 'training/news-commentary-v9.de-en.en'
+ }
+ },
+ 'v10': {
+ 'url': 'http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz',
+ 'de-en': {
+ 'de': 'news-commentary-v10.de-en.de',
+                'en': 'news-commentary-v10.de-en.en'
+ }
+ },
+ 'v11': {
+ 'url': 'http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz',
+ 'de-en': {
+ 'de': 'training-parallel-nc-v11/news-commentary-v11.de-en.de',
+ 'en': 'training-parallel-nc-v11/news-commentary-v11.de-en.en'
+ }
+ },
+ 'v12': {
+ 'url': 'http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz',
+ 'de-en': {
+ 'de': 'training/news-commentary-v12.de-en.de',
+ 'en': 'training/news-commentary-v12.de-en.en',
+ },
+ 'zh-en': {
+ 'zh': 'training/news-commentary-v12.zh-en.zh',
+ 'en': 'training/news-commentary-v12.zh-en.en'
+ }
+ },
+ 'v13': {
+ 'url': 'http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz',
+ 'de-en': {
+ 'de': 'training-parallel-nc-v13/news-commentary-v13.de-en.de',
+ 'en': 'training-parallel-nc-v13/news-commentary-v13.de-en.en',
+ },
+ 'zh-en': {
+ 'zh': 'training-parallel-nc-v13/news-commentary-v13.zh-en.zh',
+ 'en': 'training-parallel-nc-v13/news-commentary-v13.zh-en.en'
+ }
+ },
+ 'v14': {
+ 'de-en': {
+ 'url': 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz',
+ 'all': 'news-commentary-v14.de-en.tsv'
+ },
+ 'en-zh': {
+ 'url': 'http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz',
+ 'all': 'news-commentary-v14.en-zh.tsv'
+ }
+ }
+ },
+ 'wikititles': {
+ 'v1': {
+ 'cs-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz',
+ 'all': 'wikititles-v1.cs-en.tsv'
+ },
+ 'cs-pl': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz',
+ 'all': 'wikititles-v1.cs-pl.tsv'
+ },
+ 'de-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz',
+ 'all': 'wikititles-v1.de-en.tsv'
+ },
+ 'es-pt': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz',
+ 'all': 'wikititles-v1.es-pt.tsv'
+ },
+ 'fi-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz',
+ 'all': 'wikititles-v1.fi-en.tsv'
+ },
+ 'gu-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz',
+ 'all': 'wikititles-v1.gu-en.tsv'
+ },
+ 'hi-ne': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz',
+ 'all': 'wikititles-v1.hi-ne.tsv'
+ },
+ 'kk-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz',
+ 'all': 'wikititles-v1.kk-en.tsv'
+ },
+ 'lt-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz',
+ 'all': 'wikititles-v1.lt-en.tsv'
+ },
+ 'ru-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz',
+ 'all': 'wikititles-v1.ru-en.tsv'
+ },
+ 'zh-en': {
+ 'url': 'http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz',
+ 'all': 'wikititles-v1.zh-en.tsv'
+ }
+ }
+ },
+ 'uncorpus': {
+ 'v1': {
+ 'en-zh': {
+ 'url': ['https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00',
+ 'https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01'],
+ 'en': 'en-zh/UNv1.0.en-zh.en',
+ 'zh': 'en-zh/UNv1.0.en-zh.zh'
+ }
+ }
+ },
+ # For the CWMT dataset, you can also download them from the official location: http://nlp.nju.edu.cn/cwmt-wmt/
+ # Currently, this version is processed via https://gist.github.com/sxjscience/54bedd68ce3fb69b3b1b264377efb5a5
+ 'cwmt': {
+ 'url': 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz',
+ 'zh-en': {
+ 'en': 'cwmt/cwmt-zh-en.en',
+ 'zh': 'cwmt/cwmt-zh-en.zh'
+ }
+ },
+ 'rapid': {
+ '2016': {
+ 'url': 'http://data.statmt.org/wmt17/translation-task/rapid2016.tgz',
+ 'de-en': {
+ 'de': 'rapid2016.de-en.de',
+ 'en': 'rapid2016.de-en.en'
+ }
+ },
+ '2019': {
+ 'de-en': {
+ 'url': 'https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip',
+ 'de': 'rapid2019.de-en.de',
+ 'en': 'rapid2019.de-en.en'
+ }
+ }
+ },
+}
+
+_MONOLINGUAL_URLS = {
+ 'newscrawl': {
+ '2007': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2007.de',
+ }
+ },
+ '2008': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2008.de',
+ }
+ },
+ '2009': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2009.de',
+ }
+ },
+        '2010': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2010.de',
+ }
+ },
+ '2011': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2011.de',
+ }
+ },
+ '2012': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2012.de',
+ }
+ },
+ '2013': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2013.de',
+ }
+ },
+ '2014': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2014.de',
+ }
+ },
+ '2015': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2015.de',
+ }
+ },
+ '2016': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2016.de',
+ }
+ },
+ '2017': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2017.de',
+ }
+ },
+ '2018': {
+ 'de': {
+ 'url': 'http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz',
+ 'de': 'newscrawl2018.de',
+ }
+ },
+ }
+}
+
+with open(os.path.join(_CURR_DIR, '..', 'url_checksums', 'mirror', 'wmt.json')) as wmt_mirror_map_f:
+ _WMT_MIRROR_URL_MAP = json.load(wmt_mirror_map_f)
+
+def _download_with_mirror(url, path, sha1_hash):
+ return download(
+ get_repo_url() + _WMT_MIRROR_URL_MAP[url] if url in _WMT_MIRROR_URL_MAP else url,
+ path=path,
+ sha1_hash=sha1_hash
+ )
+
+def _clean_space(s: str):
+ """Removes trailing and leading spaces and collapses multiple consecutive internal spaces to a single one.
+ This is borrowed from sacrebleu: https://github.com/mjpost/sacreBLEU/blob/069b0c88fceb29f3e24c3c19ba25342a3e7f96cb/sacrebleu.py#L1077
+
+ Parameters
+ ----------
+ s
+ The input string
+
+ Returns
+ -------
+ ret
+ The cleaned string
+ """
+ return re.sub(r'\s+', ' ', s.strip())
+
+
+def _get_buffer(path_or_buffer: Union[str, IO[AnyStr]], mode='r'):
+ if isinstance(path_or_buffer, str):
+ buf = open(path_or_buffer, mode)
+ else:
+ buf = path_or_buffer
+ return buf
+
+
+def parse_sgm(path_or_buffer: Union[str, IO[AnyStr]],
+ out_path_or_buffer: Optional[Union[str, IO[AnyStr]]] = None,
+ return_sentences=False,
+ clean_space=True) -> Optional[List[str]]:
+ """Returns sentences from a single SGML file. This is compatible to the behavior of
+ `input-from-sgm.perl` in
+ https://github.com/moses-smt/mosesdecoder/blob/a89691fee395bb7eb6dfd51e368825f0578f437d/scripts/ems/support/input-from-sgm.perl
+
+ Parameters
+ ----------
+ path_or_buffer
+ The source path to parse the file
+ out_path_or_buffer
+ The output path
+ return_sentences
+ Whether to return the parsed sentences
+ clean_space
+ Whether to clean the spaces in the sentence with the similar strategy in
+ input-from-sgm.perl.
+
+ Returns
+ -------
+ sentences
+        The list of parsed sentences in the input file.
+        If return_sentences is False, None is returned.
+ """
+ if out_path_or_buffer is None:
+ assert return_sentences, 'Must return sentences if the output path is not specified!'
+ if return_sentences:
+ sentences = []
+ else:
+ sentences = None
+ f_buffer = _get_buffer(path_or_buffer, 'r')
+ of_buffer = _get_buffer(out_path_or_buffer, 'w')
+    seg_re = re.compile(r'<seg id="\d+">(.*)</seg>.*?')
+ for line in f_buffer:
+ if isinstance(line, bytes):
+ line = line.decode('utf-8')
+ seg_match = re.match(seg_re, line)
+ if seg_match:
+ assert len(seg_match.groups()) == 1,\
+ 'File content is not supported, unmatched line: {}'.format(line)
+ line = seg_match.groups()[0]
+ if clean_space:
+ line = _clean_space(line)
+ if of_buffer is not None:
+ of_buffer.write(line + '\n')
+ if sentences is not None:
+ sentences.append(line)
+ if of_buffer is not None:
+ of_buffer.close()
+ return sentences
+
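+# A hypothetical usage sketch of parse_sgm (the file names below are illustrative only):
+#   parse_sgm('newstest2014-deen-src.de.sgm', 'test.raw.de')
+# would write one cleaned sentence per line to 'test.raw.de'.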
+
+def parse_paracrawl_tmx(path_or_buffer, src_lang, tgt_lang, out_src_path, out_tgt_path,
+ clean_space=True, filter_profanity=False):
+ candidate_lang = {src_lang, tgt_lang}
+ sent_num = 0
+ if filter_profanity:
+ src_profanity_filter = ProfanityFilter(langs=[src_lang])
+ tgt_profanity_filter = ProfanityFilter(langs=[tgt_lang])
+ has_src = False
+ has_tgt = False
+ src_sentence = None
+ tgt_sentence = None
+ f = _get_buffer(path_or_buffer)
+ src_out_f = open(out_src_path, 'w', encoding='utf-8')
+ tgt_out_f = open(out_tgt_path, 'w', encoding='utf-8')
+ for i, (_, elem) in enumerate(ElementTree.iterparse(f)):
+ if elem.tag == "tu":
+ for tuv in elem.iterfind("tuv"):
+ lang = None
+ for k, v in tuv.items():
+ if k.endswith('}lang'):
+ assert v in candidate_lang,\
+ 'Find language={} in data, which is not the same as either' \
+ ' the source/target languages={}/{}'.format(v, src_lang, tgt_lang)
+ lang = v
+ break
+ if lang is not None:
+ segs = tuv.findall("seg")
+ assert len(segs) == 1, "Invalid number of segments: {}".format(len(segs))
+ if lang == src_lang:
+ assert not has_src
+ has_src = True
+ src_sentence = segs[0].text
+ else:
+ assert not has_tgt
+ has_tgt = True
+ tgt_sentence = segs[0].text
+ if has_src and has_tgt:
+ has_src, has_tgt = False, False
+ if clean_space:
+ # Merge the spaces
+ src_sentence = _clean_space(src_sentence)
+ tgt_sentence = _clean_space(tgt_sentence)
+ if filter_profanity:
+ if src_profanity_filter.match(src_sentence)\
+ or tgt_profanity_filter.match(tgt_sentence):
+ continue
+ sent_num += 1
+ if sent_num % 500000 == 0:
+ print('Processed {} sentences'.format(sent_num))
+ src_out_f.write(src_sentence + '\n')
+ tgt_out_f.write(tgt_sentence + '\n')
+ elem.clear()
+ src_out_f.close()
+ tgt_out_f.close()
+    assert not has_src and not has_tgt,\
+        'The number of source and target sentences are not the same.'
+
+
+def parse_tsv(path_or_buffer, src_out_path, tgt_out_path):
+ in_f = _get_buffer(path_or_buffer, 'r')
+ src_out_f = _get_buffer(src_out_path, 'w')
+ tgt_out_f = _get_buffer(tgt_out_path, 'w')
+ for line in in_f:
+ line = line.strip()
+ split_data = line.split('\t')
+ if len(split_data) == 2:
+ # Here, some lines may be corrupted and may not have a target translation
+ src_sentence, tgt_sentence = split_data
+ src_out_f.write(src_sentence + '\n')
+ tgt_out_f.write(tgt_sentence + '\n')
+
+
+def split_lang_pair(pair: str = 'de-en') -> Tuple[str, str]:
+ try:
+ src_lang, tgt_lang = pair.split('-')
+ except ValueError:
+        raise ValueError('pair must be in a format like "en-de", "zh-en". Received {}'
+ .format(pair))
+ return src_lang, tgt_lang
+
+
+def concatenate_files(fname_l: List[str],
+ out_fname: Optional[str] = None,
+ chunk_size: int = 128 * 1024) -> str:
+ """Concatenate multiple files into a single file. This is used to recover a large file that has
+ been split into multiple parts. E.g.,
+
+ UNv1.0.en-zh.tar.gz.00, UNv1.0.en-zh.tar.gz.01 --> UNv1.0.en-zh.tar.gz
+
+ Parameters
+ ----------
+ fname_l
+ out_fname
+ chunk_size
+
+ Returns
+ -------
+ ret
+ """
+ assert len(fname_l) > 1
+ ext_l = []
+ base_prefix, ext = os.path.splitext(fname_l[0])
+ ext_l.append(ext)
+ for i in range(1, len(fname_l)):
+ prefix, ext = os.path.splitext(fname_l[i])
+ ext_l.append(ext)
+ if prefix != base_prefix:
+ raise ValueError('Cannot concatenate the input files! The prefix does not match! '
+                         'Found prefix={}, expected prefix={}'.format(prefix, base_prefix))
+ fname_ext_l = sorted(zip(fname_l, ext_l), key=lambda ele: ele[1])
+ if out_fname is None:
+ out_fname = base_prefix
+ with open(out_fname, 'wb') as of:
+ for fname, _ in fname_ext_l:
+ with open(fname, 'rb') as infile:
+ for block in iter(functools.partial(infile.read, chunk_size), b''):
+ of.write(block)
+ return out_fname
+
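+# For example, the two-part UN corpus archive listed in _PARA_URLS can be recovered with a call
+# like the following (a usage sketch; the part files must already exist locally):
+#   concatenate_files(['UNv1.0.en-zh.tar.gz.00', 'UNv1.0.en-zh.tar.gz.01'])
+# which writes and returns 'UNv1.0.en-zh.tar.gz'.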
+
+def extract_mono_corpus(compressed_data_path, lang, name, out_src_path):
+ tmp_dir = os.path.join(os.path.dirname(compressed_data_path), 'raw_data')
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+ # Uncompress data
+ if compressed_data_path.endswith('.gz'):
+ with gzip.open(compressed_data_path) as f_in:
+ with open(os.path.join(tmp_dir, name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ raise NotImplementedError('Cannot process {}'.format(compressed_data_path))
+ # Parse data and move to the required src paths
+
+ shutil.copyfile(os.path.join(tmp_dir, name), out_src_path)
+
+ # Clean-up
+ shutil.rmtree(tmp_dir)
+
+
+def fetch_mono_dataset(selection: Union[str, List[str], List[List[str]]],
+ lang: str = 'de',
+ path: Optional[str] = _BASE_DATASET_PATH,
+ overwrite: bool = False) -> List[str]:
+ """Fetch the monolingual dataset provided by WMT
+
+ Parameters
+ ----------
+ selection
+ The selected datasets
+ lang
+ Language of the monolingual corpus
+    path
+        The path used to cache the downloaded dataset
+    overwrite
+ Whether to overwrite the downloaded dataset
+
+ Returns
+ -------
+ src_corpus_paths
+ """
+ base_url_info = _MONOLINGUAL_URLS
+ if isinstance(selection, str):
+ selection = [selection]
+ elif isinstance(selection, list):
+ if isinstance(selection[0], list):
+ corpus_paths = []
+ for ele in selection:
+ ele_corpus_paths =\
+ fetch_mono_dataset(ele, lang, path, overwrite)
+ corpus_paths.extend(ele_corpus_paths)
+ return corpus_paths
+ else:
+ raise NotImplementedError
+ for sel in selection:
+ base_url_info = base_url_info[sel]
+
+ # Check the pair is valid
+ available_lang = set(base_url_info.keys())
+ if 'url' in available_lang:
+ available_lang.remove('url')
+ if lang in available_lang:
+ matched_lang = '{}'.format(lang)
+ else:
+ raise ValueError('Unsupported lang, lang={}. All supported: {}'
+ .format(lang, available_lang))
+ save_dir_path = os.path.join(path, *(selection + [matched_lang]))
+ if not os.path.exists(save_dir_path):
+ os.makedirs(save_dir_path)
+ out_path = os.path.join(save_dir_path, lang + '.txt')
+ # Check for whether we can load the cached version
+ if os.path.exists(out_path) and not overwrite:
+ print('Found data in {}, skip:\n'
+ '\tSource: {}\n'.format(selection + [lang], out_path))
+ return [out_path]
+ lang_data_info = base_url_info[matched_lang]
+ if 'url' in lang_data_info:
+ url_l = lang_data_info['url']
+ else:
+ url_l = base_url_info['url']
+ # Download the data + Concatenate the file-parts (if necessary)
+ download_fname_l = []
+ if isinstance(url_l, str):
+ url_l = [url_l]
+ for url in url_l:
+ original_filename = url[url.rfind("/") + 1:]
+ sha1_hash = _URL_FILE_STATS[url]
+ if 'url' in lang_data_info:
+ save_path_l = [path] + selection + [matched_lang, original_filename]
+ else:
+ save_path_l = [path] + selection + [original_filename]
+ download_fname = _download_with_mirror(
+ url,
+ path=os.path.join(*save_path_l),
+ sha1_hash=sha1_hash
+ )
+ download_fname_l.append(download_fname)
+ if len(download_fname_l) > 1:
+ data_path = concatenate_files(download_fname_l)
+ else:
+ data_path = download_fname_l[0]
+
+ src_name = lang_data_info[lang]
+ print('Prepare data for {}\n'
+ '\tCompressed File: {}\n'
+ '\t{}: {}\n'.format(selection + [lang],
+ data_path,
+ lang, out_path))
+ extract_mono_corpus(data_path,
+ lang=lang,
+ name=src_name,
+ out_src_path=out_path)
+ return [out_path]
+
+
+def extract_src_tgt_corpus(compressed_data_path,
+ data_lang_pair, src_lang, tgt_lang,
+ src_name, tgt_name, src_tgt_name,
+ out_src_path, out_tgt_path):
+ data_src_lang, data_tgt_lang = split_lang_pair(data_lang_pair)
+ if not ((src_lang == data_src_lang and tgt_lang == data_tgt_lang) or
+ (src_lang == data_tgt_lang and tgt_lang == data_src_lang)):
+ raise ValueError('Mismatch src/tgt language. Required pair={}, Given src={}, tgt={}'
+ .format(data_lang_pair, src_lang, tgt_lang))
+ reverse_pair = (src_lang == data_tgt_lang) and (tgt_lang == data_src_lang)
+ if src_tgt_name is not None:
+ assert src_name is None and tgt_name is None
+ tmp_dir = os.path.join(os.path.dirname(compressed_data_path), 'raw_data')
+ if not os.path.exists(tmp_dir):
+ os.makedirs(tmp_dir)
+ # Uncompress data
+ if compressed_data_path.endswith('.tar.gz') or compressed_data_path.endswith('.tgz'):
+ with tarfile.open(compressed_data_path) as f:
+ if src_tgt_name is None:
+ f.extract(src_name, tmp_dir)
+ f.extract(tgt_name, tmp_dir)
+ else:
+                f.extract(src_tgt_name, tmp_dir)
+ elif compressed_data_path.endswith('.gz'):
+ assert src_tgt_name is not None
+ with gzip.open(compressed_data_path) as f_in:
+ with open(os.path.join(tmp_dir, src_tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ elif compressed_data_path.endswith('.zip'):
+ with zipfile.ZipFile(compressed_data_path) as zip_handler:
+ if src_tgt_name is None:
+ with zip_handler.open(src_name) as f_in:
+ with open(os.path.join(tmp_dir, src_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ with zip_handler.open(tgt_name) as f_in:
+ with open(os.path.join(tmp_dir, tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ with zip_handler.open(src_tgt_name) as f_in:
+ with open(os.path.join(tmp_dir, src_tgt_name), 'wb') as f_out:
+ shutil.copyfileobj(f_in, f_out)
+ else:
+ raise NotImplementedError('Cannot process {}'.format(compressed_data_path))
+ # Parse data and move to the required src/tgt path
+ if src_tgt_name is None:
+ if src_name.endswith('.sgm'):
+ parse_sgm(os.path.join(tmp_dir, src_name), out_src_path)
+ parse_sgm(os.path.join(tmp_dir, tgt_name), out_tgt_path)
+ else:
+ shutil.copyfile(os.path.join(tmp_dir, src_name), out_src_path)
+ shutil.copyfile(os.path.join(tmp_dir, tgt_name), out_tgt_path)
+ else:
+ if src_tgt_name.endswith('.tmx'):
+ parse_paracrawl_tmx(os.path.join(tmp_dir, src_tgt_name),
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ out_src_path=out_src_path,
+ out_tgt_path=out_tgt_path,
+ clean_space=True,
+ filter_profanity=False)
+ elif src_tgt_name.endswith('.tsv'):
+ if reverse_pair:
+ parse_tsv(os.path.join(tmp_dir, src_tgt_name), out_tgt_path, out_src_path)
+ else:
+ parse_tsv(os.path.join(tmp_dir, src_tgt_name), out_src_path, out_tgt_path)
+ else:
+ raise NotImplementedError
+ # Clean-up
+ shutil.rmtree(tmp_dir)
+
+
+def fetch_wmt_parallel_dataset(selection: Union[str, List[str], List[List[str]]],
+ lang_pair: str = 'de-en',
+ path: Optional[str] = _BASE_DATASET_PATH,
+ overwrite: bool = False) -> Tuple[List[str], List[str]]:
+ """
+
+ Parameters
+ ----------
+ selection
+ lang_pair
+ path
+ overwrite
+
+ Returns
+ -------
+ src_corpus_paths
+ target_corpus_paths
+ """
+ src_lang, tgt_lang = split_lang_pair(lang_pair)
+ base_url_info = _PARA_URLS
+ if isinstance(selection, str):
+ selection = [selection]
+ elif isinstance(selection, list):
+ if isinstance(selection[0], list):
+ src_corpus_paths = []
+ tgt_corpus_paths = []
+ for ele in selection:
+ ele_src_corpus_paths, ele_tgt_corpus_paths =\
+ fetch_wmt_parallel_dataset(ele, lang_pair, path, overwrite)
+ src_corpus_paths.extend(ele_src_corpus_paths)
+ tgt_corpus_paths.extend(ele_tgt_corpus_paths)
+ return src_corpus_paths, tgt_corpus_paths
+ else:
+ raise NotImplementedError
+ for sel in selection:
+ base_url_info = base_url_info[sel]
+ # Check the pair is valid
+ available_pairs = set(base_url_info.keys())
+ if 'url' in available_pairs:
+ available_pairs.remove('url')
+ if str(src_lang) + '-' + str(tgt_lang) in available_pairs:
+ matched_pair = '{}-{}'.format(src_lang, tgt_lang)
+ elif str(tgt_lang) + '-' + str(src_lang) in available_pairs:
+ matched_pair = '{}-{}'.format(tgt_lang, src_lang)
+ else:
+ raise ValueError('Unsupported pairs, src_lang={}, tgt_lang={}. All supported: {}'
+ .format(src_lang, tgt_lang, available_pairs))
+ save_dir_path = os.path.join(path, *(selection + [matched_pair]))
+ if not os.path.exists(save_dir_path):
+ os.makedirs(save_dir_path)
+ out_src_path = os.path.join(save_dir_path, src_lang + '.txt')
+ out_tgt_path = os.path.join(save_dir_path, tgt_lang + '.txt')
+ # Check for whether we can load the cached version
+ # TODO we can do something smarter here
+ if os.path.exists(out_src_path) and os.path.exists(out_tgt_path) and not overwrite:
+ print('Found data in {}, skip:\n'
+ '\tSource: {}\n'
+ '\tTarget: {}\n'.format(selection + [lang_pair], out_src_path, out_tgt_path))
+ return [out_src_path], [out_tgt_path]
+ pair_data_info = base_url_info[matched_pair]
+ if 'url' in pair_data_info:
+ url_l = pair_data_info['url']
+ else:
+ url_l = base_url_info['url']
+ # Download the data + Concatenate the file-parts (if necessary)
+ download_fname_l = []
+ if isinstance(url_l, str):
+ url_l = [url_l]
+ for url in url_l:
+ original_filename = url[url.rfind("/") + 1:]
+ sha1_hash = _URL_FILE_STATS[url]
+ if 'url' in pair_data_info:
+ save_path_l = [path] + selection + [matched_pair, original_filename]
+ else:
+ save_path_l = [path] + selection + [original_filename]
+ download_fname = _download_with_mirror(
+ url,
+ path=os.path.join(*save_path_l),
+ sha1_hash=sha1_hash
+ )
+ download_fname_l.append(download_fname)
+ if len(download_fname_l) > 1:
+ data_path = concatenate_files(download_fname_l)
+ else:
+ data_path = download_fname_l[0]
+ if 'all' in pair_data_info:
+ src_name, tgt_name, src_tgt_name = None, None, pair_data_info['all']
+ else:
+ src_name, tgt_name, src_tgt_name = pair_data_info[src_lang], pair_data_info[tgt_lang], None
+ print('Prepare data for {}\n'
+ '\tCompressed File: {}\n'
+ '\t{}: {}\n'
+ '\t{}: {}\n'.format(selection + [lang_pair],
+ data_path,
+ src_lang, out_src_path,
+ tgt_lang, out_tgt_path))
+ extract_src_tgt_corpus(data_path,
+ data_lang_pair=matched_pair,
+ src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ src_name=src_name,
+ tgt_name=tgt_name,
+ src_tgt_name=src_tgt_name,
+ out_src_path=out_src_path,
+ out_tgt_path=out_tgt_path)
+ assert file_line_number(out_src_path) == file_line_number(out_tgt_path)
+ return [out_src_path], [out_tgt_path]
+
+
+def download_mono_newscrawl(lang: str = 'de', path: str = _BASE_DATASET_PATH)\
+ -> List[str]:
+ """Download the train dataset used for WMT2014
+
+ Parameters
+ ----------
+ lang
+ path
+
+ Returns
+ -------
+ train_src_paths
+ """
+ if lang == 'de':
+ train_src_paths =\
+ fetch_mono_dataset([['newscrawl', '2017'],
+ ['newscrawl', '2018']],
+ lang=lang,
+ path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths
+
+
+def download_wmt14_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2014
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths =\
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v9']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+def download_wmt16_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2016
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v11']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+def download_wmt17_train(lang_pair: str = 'en-de', path: str = _BASE_DATASET_PATH)\
+ -> Tuple[List[str], List[str]]:
+ """Download the train dataset used for WMT2017
+
+ Parameters
+ ----------
+ lang_pair
+ path
+
+ Returns
+ -------
+ train_src_paths
+ train_tgt_paths
+
+ """
+ if lang_pair == 'en-de' or lang_pair == 'de-en':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['europarl', 'v7'],
+ ['commoncrawl', 'wmt13'],
+ ['newscommentary', 'v12'],
+ ['rapid', '2016']], lang_pair, path=path)
+ elif lang_pair == 'zh-en' or lang_pair == 'en-zh':
+ train_src_paths, train_tgt_paths = \
+ fetch_wmt_parallel_dataset([['newscommentary', 'v13'],
+ ['uncorpus', 'v1'],
+ ['cwmt']], lang_pair, path=path)
+ else:
+ raise NotImplementedError
+ return train_src_paths, train_tgt_paths
+
+
+@DATA_PARSER_REGISTRY.register('prepare_wmt')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading and Preprocessing WMT Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+                        choices=['wmt2014', 'wmt2016', 'wmt2017', 'newscrawl'],
+ help='The dataset to use.')
+ parser.add_argument('--mono', action='store_true',
+ help='Download monolingual dataset.')
+ parser.add_argument('--mono_lang', type=str, default='de',
+ help='The monolingual language.')
+ parser.add_argument('--lang-pair', type=str, default='en-de',
+ help='The pair of source language and target language separated by "-", '
+ 'e.g. "en-de", "en-zh".')
+ parser.add_argument('--mode', choices=['path_only',
+ 'raw'],
+ default='raw',
+                        help='If the mode is "path_only", the script will only output the'
+                             ' paths of the raw corpora. If the mode is "raw", the script will'
+                             ' concatenate all the related corpora and save them to the folder.')
+ parser.add_argument('--save-path', type=str, default='wmt_data',
+ help='The path to save the dataset.')
+ parser.add_argument('--prefix', type=str, default='train.raw',
+ help='The prefix of the saved raw files.')
+    parser.add_argument('--overwrite', action='store_true',
+                        help='Whether to overwrite the saved raw files.')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to cache the downloaded files.')
+ return parser
+
+
+def mono_main(args):
+ lang = args.mono_lang
+ if args.dataset.lower() == 'newscrawl':
+ if lang == 'de':
+ train_src_paths =\
+ download_mono_newscrawl('de', args.cache_path)
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+ if args.mode == 'path_only':
+ print('Dataset: {}/{}'.format(args.dataset, args.mono_lang))
+ print('Train Source:')
+ for path in train_src_paths:
+ print('\t{}'.format(path))
+ elif args.mode == 'raw':
+ assert args.save_path is not None
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ print('Save to {}'.format(args.save_path))
+ raw_src_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, lang))
+ if not os.path.exists(raw_src_path) or args.overwrite:
+ with open(raw_src_path, 'wb') as out_f:
+ for ele_path in train_src_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ else:
+ raise NotImplementedError
+
+
+@DATA_MAIN_REGISTRY.register('prepare_wmt')
+def main(args):
+ if args.mono:
+ mono_main(args)
+ else:
+ src_lang, tgt_lang = split_lang_pair(args.lang_pair)
+ if args.dataset.lower() == 'wmt2014':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt14_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ elif args.dataset.lower() == 'wmt2016':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt16_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ elif args.dataset.lower() == 'wmt2017':
+ if (src_lang, tgt_lang) in [('en', 'de'), ('de', 'en'),
+ ('zh', 'en'), ('en', 'zh')]:
+ train_src_paths, train_tgt_paths =\
+ download_wmt17_train(args.lang_pair, args.cache_path)
+ else:
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+ if args.mode == 'path_only':
+ print('Dataset: {}/{}'.format(args.dataset, args.lang_pair))
+ print('Train Source:')
+ for path in train_src_paths:
+ print('\t{}'.format(path))
+ print('Train Target:')
+ for path in train_tgt_paths:
+ print('\t{}'.format(path))
+ elif args.mode == 'raw':
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ print('Save to {}'.format(args.save_path))
+ raw_src_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, src_lang))
+ raw_tgt_path = os.path.join(args.save_path, '{}.{}'.format(args.prefix, tgt_lang))
+ if not os.path.exists(raw_src_path) or args.overwrite:
+ with open(raw_src_path, 'wb') as out_f:
+ for ele_path in train_src_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ if not os.path.exists(raw_tgt_path) or args.overwrite:
+ with open(raw_tgt_path, 'wb') as out_f:
+ for ele_path in train_tgt_paths:
+ with open(ele_path, 'rb') as in_f:
+ shutil.copyfileobj(in_f, out_f)
+ assert file_line_number(raw_src_path) == file_line_number(raw_tgt_path)
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
+
diff --git a/scripts/datasets/machine_translation/wmt2014_ende.sh b/scripts/datasets/machine_translation/wmt2014_ende.sh
new file mode 100644
index 0000000000..6557715365
--- /dev/null
+++ b/scripts/datasets/machine_translation/wmt2014_ende.sh
@@ -0,0 +1,78 @@
+SUBWORD_ALGO=$1
+SRC=en
+TGT=de
+SAVE_PATH=wmt2014_ende
+
+# Fetch the raw text
+nlp_data prepare_wmt \
+ --dataset wmt2014 \
+ --lang-pair ${SRC}-${TGT} \
+ --save-path ${SAVE_PATH}
+
+# We use sacrebleu to fetch the dev set (newstest2013) and test set (newstest2014)
+sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
+sacrebleu -t wmt13 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
+sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
+sacrebleu -t wmt14/full -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
+
+
+# Clean and tokenize the training + dev corpus
+cd ${SAVE_PATH}
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus train.raw.${SRC} \
+ --tgt-corpus train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --max-ratio 1.5 \
+ --src-save-path train.tok.${SRC} \
+ --tgt-save-path train.tok.${TGT}
+
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus dev.raw.${SRC} \
+ --tgt-corpus dev.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --max-ratio 1.5 \
+ --src-save-path dev.tok.${SRC} \
+ --tgt-save-path dev.tok.${TGT}
+
+# For test corpus, we will just tokenize the data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus test.raw.${SRC} \
+ --tgt-corpus test.raw.${TGT} \
+ --src-save-path test.tok.${SRC} \
+ --tgt-save-path test.tok.${TGT}
+
+# Learn BPE with the training data
+nlp_preprocess learn_subword --corpus train.tok.${SRC} train.tok.${TGT} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir . \
+ --vocab-size 32768
+
+# Apply the learned codes to the training set
+for LANG in ${SRC} ${TGT}
+do
+nlp_preprocess apply_subword --model ${SUBWORD_ALGO}\
+ --output-type subword \
+ --model-path ${SUBWORD_ALGO}.model \
+ --vocab-path ${SUBWORD_ALGO}.vocab \
+ --corpus train.tok.${LANG} \
+ --save-path train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Apply the learned codes to the dev/test set
+for LANG in ${SRC} ${TGT}
+do
+ for SPLIT in dev test
+ do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${SUBWORD_ALGO}.model \
+ --vocab-path ${SUBWORD_ALGO}.vocab \
+ --corpus ${SPLIT}.tok.${LANG} \
+ --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
+ done
+done
diff --git a/scripts/datasets/machine_translation/wmt2017_zhen.sh b/scripts/datasets/machine_translation/wmt2017_zhen.sh
new file mode 100644
index 0000000000..95e1b6492d
--- /dev/null
+++ b/scripts/datasets/machine_translation/wmt2017_zhen.sh
@@ -0,0 +1,89 @@
+SUBWORD_ALGO=$1
+SRC=zh
+TGT=en
+SAVE_PATH=wmt2017_zhen
+
+# Fetch the raw text
+nlp_data prepare_wmt \
+ --dataset wmt2017 \
+ --lang-pair ${SRC}-${TGT} \
+ --save-path ${SAVE_PATH}
+
+# We use sacrebleu to fetch the dev set and test set of wmt17
+sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/dev.raw.${SRC}
+sacrebleu -t wmt17/dev -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/dev.raw.${TGT}
+sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo src > ${SAVE_PATH}/test.raw.${SRC}
+sacrebleu -t wmt17 -l ${SRC}-${TGT} --echo ref > ${SAVE_PATH}/test.raw.${TGT}
+
+
+# Clean and tokenize the training + dev corpus
+cd ${SAVE_PATH}
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus train.raw.${SRC} \
+ --tgt-corpus train.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --max-ratio 1.3 \
+ --min-num-words 3 \
+ --max-num-words 70 \
+ --src-save-path train.tok.${SRC} \
+ --tgt-save-path train.tok.${TGT}
+
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus dev.raw.${SRC} \
+ --tgt-corpus dev.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --max-ratio 1.3 \
+ --min-num-words 3 \
+ --max-num-words 70 \
+ --src-save-path dev.tok.${SRC} \
+ --tgt-save-path dev.tok.${TGT}
+
+# For test corpus, we will just tokenize the data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus test.raw.${SRC} \
+ --tgt-corpus test.raw.${TGT} \
+ --src-tokenizer jieba \
+ --tgt-tokenizer moses \
+ --src-save-path test.tok.${SRC} \
+ --tgt-save-path test.tok.${TGT}
+
+# Learn BPE with the training data. We learn independent source/target vocabularies
+
+nlp_preprocess learn_subword --corpus train.tok.${SRC} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir ./${SRC}_model \
+ --vocab-size 44000
+nlp_preprocess learn_subword --corpus train.tok.${TGT} \
+ --model ${SUBWORD_ALGO} \
+ --save-dir ./${TGT}_model \
+ --vocab-size 33000
+
+# Apply the learned codes to the training set
+for LANG in ${SRC} ${TGT}
+do
+nlp_preprocess apply_subword --model ${SUBWORD_ALGO}\
+ --output-type subword \
+ --model-path ${LANG}_model/${SUBWORD_ALGO}.model \
+ --vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
+ --corpus train.tok.${LANG} \
+ --save-path train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Apply the learned codes to the dev/test set
+for LANG in ${SRC} ${TGT}
+do
+ for SPLIT in dev test
+ do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${LANG}_model/${SUBWORD_ALGO}.model \
+ --vocab-path ${LANG}_model/${SUBWORD_ALGO}.vocab \
+ --corpus ${SPLIT}.tok.${LANG} \
+ --save-path ${SPLIT}.tok.${SUBWORD_ALGO}.${LANG}
+ done
+done
diff --git a/scripts/datasets/music_generation/README.md b/scripts/datasets/music_generation/README.md
new file mode 100644
index 0000000000..983c271de5
--- /dev/null
+++ b/scripts/datasets/music_generation/README.md
@@ -0,0 +1,42 @@
+# Music Generation
+
+We provide datasets for training a music generation model.
+
+## Maestro
+
+See https://magenta.tensorflow.org/datasets/maestro for a detailed introduction.
+
+```
+# Get V1 Dataset
+nlp_data prepare_music_midi --dataset maestro_v1
+
+# Get V2 Dataset
+nlp_data prepare_music_midi --dataset maestro_v2
+```
+
+## LakhMIDI
+
+See https://colinraffel.com/projects/lmd/ for more details
+
+```
+# Get Lakh MIDI Full Dataset
+nlp_data prepare_music_midi --dataset lmd_full
+
+# Get the subset of 45,129 files from LMD-full
+# which have been matched to entries in the Million Song Dataset
+nlp_data prepare_music_midi --dataset lmd_matched
+
+# Get the aligned version of lmd_matched
+nlp_data prepare_music_midi --dataset lmd_aligned
+
+# Get the clean midi data
+nlp_data prepare_music_midi --dataset clean_midi
+```
+
+## Geocities
+
+The Geocities collection of MIDI files.
+See https://archive.org/details/archiveteam-geocities-midi-collection-2009 for more details.
+```
+nlp_data prepare_music_midi --dataset geocities
+```
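+
+As a quick sanity check after downloading, you can enumerate the extracted MIDI files with a
+short Python sketch like the one below. The folder name and the `.mid`/`.midi` extensions are
+assumptions that depend on which dataset you fetched (by default the data is extracted into a
+folder named after the dataset).
+
+```python
+import glob
+import os
+
+dataset_dir = 'maestro_v2'  # default output folder for `--dataset maestro_v2` (assumption)
+midi_files = [p for ext in ('*.mid', '*.midi')
+              for p in glob.glob(os.path.join(dataset_dir, '**', ext), recursive=True)]
+print('Found {} MIDI files under {}'.format(len(midi_files), dataset_dir))
+```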
diff --git a/scripts/datasets/music_generation/__init__.py b/scripts/datasets/music_generation/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/music_generation/prepare_music_midi.py b/scripts/datasets/music_generation/prepare_music_midi.py
new file mode 100644
index 0000000000..cb07cb5687
--- /dev/null
+++ b/scripts/datasets/music_generation/prepare_music_midi.py
@@ -0,0 +1,110 @@
+import argparse
+import os
+import tarfile
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+import zipfile
+
+_CITATIONS = """
+@phdthesis{raffel2016learning,
+ title={Learning-based methods for comparing sequences, with applications to audio-to-midi alignment and matching},
+ author={Raffel, Colin},
+ year={2016},
+ school={Columbia University}
+}
+
+@inproceedings{hawthorne2018enabling,
+ title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset},
+ author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck},
+ booktitle={International Conference on Learning Representations},
+ year={2019},
+ url={https://openreview.net/forum?id=r1lYRjC9F7},
+}
+"""
+
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'music_midi_data')
+
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'music_midi.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_URLS = {
+ 'lmd_full': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz',
+ 'lmd_matched': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz',
+ 'lmd_aligned': 'http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz',
+ 'clean_midi': 'http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz',
+ 'maestro_v1': 'https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip',
+ 'maestro_v2': 'https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip',
+ 'geocities': 'https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_music_midi')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download the Music Midi Datasets.')
+ parser.add_argument('--dataset', type=str, required=True,
+ choices=['lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi',
+ 'maestro_v1', 'maestro_v2', 'geocities'],
+ help='The dataset to download.')
+ parser.add_argument('--save-dir', type=str, default=None,
+ help='The directory to save the dataset.'
+ ' By default, it will save to a folder with the same name as the '
+ 'dataset')
+ parser.add_argument('--overwrite', action='store_true',
+ help='Whether to overwrite the directory.')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The temporary path to download the compressed dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_music_midi')
+def main(args):
+ # Download the data
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path, os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ if args.save_dir is None:
+ save_dir = args.dataset
+ else:
+ save_dir = args.save_dir
+    if not args.overwrite and os.path.exists(save_dir):
+        print('{} found, skip! Turn on --overwrite to force overwrite'.format(save_dir))
+        return
+ print('Extract the data from {} into {}'.format(target_download_location,
+ save_dir))
+    if args.dataset in ('lmd_full', 'lmd_matched', 'lmd_aligned', 'clean_midi'):
+        with tarfile.open(target_download_location) as f:
+            f.extractall(save_dir)
+    elif args.dataset in ('maestro_v1', 'maestro_v2', 'geocities'):
+        with zipfile.ZipFile(target_download_location, 'r') as fobj:
+            fobj.extractall(save_dir)
+    else:
+        raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/README.md b/scripts/datasets/pretrain_corpus/README.md
new file mode 100644
index 0000000000..1f49996bfb
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/README.md
@@ -0,0 +1,55 @@
+# Pretraining Corpus
+
+We provide a series of shared scripts for downloading/preparing the text corpus for pretraining NLP models.
+This helps create a unified text corpus for studying the performance of different pretraining algorithms.
+When releasing the datasets, we follow the [FAIR principle](https://www.go-fair.org/fair-principles/),
+i.e., the dataset needs to be findable, accessible, interoperable, and reusable.
+
+## BookCorpus
+Unfortunately, we are unable to provide the original [Toronto BookCorpus dataset](https://yknzhu.wixsite.com/mbweb) due to licensing issues.
+
+There are some open source efforts for reproducing the dataset, e.g.,
+ using [soskek/bookcorpus](https://github.com/soskek/bookcorpus) or directly downloading the [preprocessed version](https://drive.google.com/file/d/16KCjV9z_FHm8LgZw05RSuk4EsAWPOP_z/view).
+
+Nevertheless, we use [Project Gutenberg](https://www.gutenberg.org/) as an alternative to the Toronto BookCorpus.
+
+You can use the following command to download and prepare the Gutenberg dataset.
+
+```bash
+python3 prepare_bookcorpus.py --dataset gutenberg
+```
+
+Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:The_Project_Gutenberg_License) for using the data.
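+
+After the command above finishes, each book is stored as an individual `.txt` file, so the corpus
+can be iterated with plain Python. The sketch below assumes the default output folder `gutenberg`
+(i.e., no `--save_dir` was passed) and simply reports the size of each book:
+
+```python
+import glob
+import os
+
+# Iterate over the extracted Gutenberg books (one .txt file per book)
+for path in sorted(glob.glob(os.path.join('gutenberg', '*.txt'))):
+    with open(path, encoding='utf-8', errors='ignore') as f:
+        text = f.read()
+    print('{}: {} characters'.format(os.path.basename(path), len(text)))
+```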
+
+## Wikipedia
+
+Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) for preparing the data.
+
+```bash
+# Download
+python3 prepare_wikipedia.py --mode download --lang en --date latest -o ./
+
+# Properly format the text files
+python3 prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./
+
+```
+The process of downloading and formatting is time consuming, so we also offer an alternative: downloading the prepared raw text file from our S3 bucket. This raw text file is in English, was dumped on 2020-06-20, and was formatted by the very process above (`--lang en --date 20200620`).
+
+```bash
+python3 prepare_wikipedia.py --mode download_prepared -o ./
+```
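+
+The prepared Wikipedia text stores one article per line (see the `merge` step in
+`prepare_wikipedia.py`), so it can be read back with a few lines of Python. This is only a sketch:
+the `prepared_wikipedia` folder name is what the format step creates under the output directory,
+and the layout of the `download_prepared` archive may differ.
+
+```python
+import glob
+
+# Count the articles in the prepared Wikipedia text files (one article per non-empty line)
+num_articles = 0
+for fname in glob.glob('prepared_wikipedia/*.txt'):
+    with open(fname, encoding='utf-8') as f:
+        num_articles += sum(1 for line in f if line.strip())
+print('Number of articles:', num_articles)
+```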
+### References
+- [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)
+- [attardi/wikiextractor](https://github.com/attardi/wikiextractor)
+
+## OpenWebText
+
+You can download the OpenWebText from [link](https://skylion007.github.io/OpenWebTextCorpus/).
+After downloading and extracting the OpenWebText (i.e., `tar xf openwebtext.tar.xz`), you can use the following command to preprocess the dataset.
+
+```bash
+python3 prepare_openwebtext.py --input openwebtext/ --output prepared_owt --shuffle
+```
+
+In this step, the archived txt files are read directly without being decompressed to disk.
+They are concatenated into a single txt file with the same name as the archive, using empty lines to separate documents.
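+
+Each prepared txt file can later be streamed back as a sequence of documents by splitting on the
+empty separator lines. A minimal reading sketch (the `prepared_owt` folder name matches the
+command above and is otherwise an assumption):
+
+```python
+import glob
+
+def iter_documents(prepared_dir='prepared_owt'):
+    """Yield one document (as a single string) at a time from the prepared txt files."""
+    for fname in sorted(glob.glob('{}/*.txt'.format(prepared_dir))):
+        with open(fname, encoding='utf-8') as f:
+            lines = []
+            for line in f:
+                if line.strip():
+                    lines.append(line.rstrip('\n'))
+                elif lines:
+                    # An empty line marks the end of a document
+                    yield '\n'.join(lines)
+                    lines = []
+            if lines:
+                yield '\n'.join(lines)
+
+# Example: count the documents in the prepared corpus
+print(sum(1 for _ in iter_documents()))
+```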
diff --git a/scripts/datasets/pretrain_corpus/__init__.py b/scripts/datasets/pretrain_corpus/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py b/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
new file mode 100644
index 0000000000..7e00f73a98
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_bookcorpus.py
@@ -0,0 +1,91 @@
+import glob
+import os
+import argparse
+import zipfile
+from gluonnlp.base import get_data_home_dir
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+
+_CITATIONS = r"""
+@InProceedings{lahiri:2014:SRW,
+ author = {Lahiri, Shibamouli},
+ title = {{Complexity of Word Collocation Networks: A Preliminary Structural Analysis}},
+ booktitle = {Proceedings of the Student Research Workshop at the 14th Conference of the European Chapter of the Association for Computational Linguistics},
+ month = {April},
+ year = {2014},
+ address = {Gothenburg, Sweden},
+ publisher = {Association for Computational Linguistics},
+ pages = {96--105},
+ url = {http://www.aclweb.org/anthology/E14-3011}
+}
+"""
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'book_corpus.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+# The Gutenberg dataset is downloaded from:
+# https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html, and
+# is a small subset of the Project Gutenberg corpus
+# The original link for
+# downloading is https://drive.google.com/file/d/0B2Mzhc7popBga2RkcWZNcjlRTGM/edit?usp=sharing
+
+_URLS = {
+ 'gutenberg':
+ 'https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip',
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_bookcorpus')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download and Prepare the BookCorpus dataset.')
+ parser.add_argument('--dataset', type=str, choices=['gutenberg'], default='gutenberg')
+ parser.add_argument('--mode', type=str, default='raw', choices=['raw', 'format'],
+ help='Specify the mode for preparing the data.'
+ ' "raw" means to download and extract the books into the output'
+                             ' folder, each file is a book and the filename is the title of the '
+                             'book. "format" means to format the extracted txt files for '
+                             'use in pretraining.')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='The directory to save the dataset. Default is the same as the'
+ ' dataset.')
+ parser.add_argument('--cache-path', type=str,
+ default=os.path.join(get_data_home_dir(), 'book_corpus'),
+ help='The temporary path to download the compressed dataset.')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_bookcorpus')
+def main(args):
+ url = _URLS[args.dataset]
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.cache_path,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ save_dir = args.dataset if args.save_dir is None else args.save_dir
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir, exist_ok=True)
+ if args.dataset == 'gutenberg':
+ if args.mode == 'raw':
+ with zipfile.ZipFile(target_download_location) as f:
+ for name in f.namelist():
+ if name.endswith('.txt'):
+ filename = os.path.basename(name)
+                        # Write each book directly as save_dir/<title>.txt (flatten the layout)
+                        with open(os.path.join(save_dir, filename), 'wb') as out_f:
+                            out_f.write(f.read(name))
+ else:
+ # TODO(zheyuye), format for pretraining
+ raise NotImplementedError
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/prepare_openwebtext.py b/scripts/datasets/pretrain_corpus/prepare_openwebtext.py
new file mode 100644
index 0000000000..ff3edf75f5
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_openwebtext.py
@@ -0,0 +1,106 @@
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the OpenWebText Dataset Corpus for pre-training. """
+
+import os
+import re
+import time
+import random
+import tarfile
+import argparse
+import functools
+import multiprocessing
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+
+_CITATIONS = r"""
+@misc{Gokaslan2019OpenWeb,
+ title={OpenWebText Corpus},
+ author={Aaron Gokaslan and Vanya Cohen},
+ howpublished = {\url{http://Skylion007.github.io/OpenWebTextCorpus}},
+ year={2019}
+}
+"""
+
+
+@DATA_PARSER_REGISTRY.register('prepare_openwebtext')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Prepare the OpenWebText corpus for pretraining')
+ parser.add_argument("-i", "--input", required=True,
+ help="path to openwebtext dataset")
+ parser.add_argument("-o", "--output", default="openwebtext",
+ help="directory for extracted files")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--shuffle", action="store_true",
+                        help="Whether to shuffle the data order")
+ return parser
+
+
+def extract_files(full_name, output_dir, shuffle=False):
+ """
+ Extract the file and concatenate all the TXT files it archives
+ """
+ if not full_name.endswith(".xz"):
+ return
+ file_prefix = re.split(r'\.|/', full_name)[-2]
+ file_prefix = file_prefix.replace('urlsf_subset', 'openwebtext-prepared-')
+ with open("{}.txt".format(os.path.join(output_dir, file_prefix)), "w") as fp:
+ with tarfile.open(full_name) as t:
+ txt_names = t.getnames()
+ if shuffle:
+ random.shuffle(txt_names)
+ for txt_name in txt_names:
+ f = t.extractfile(txt_name)
+ for line in f.readlines():
+ # skip empty line
+ line = line.strip()
+ if line:
+ fp.write(line.decode() + '\n')
+                    # Write an extra empty line to mark the document separation
+ fp.write('\n')
+
+
+@DATA_MAIN_REGISTRY.register('prepare_openwebtext')
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ if not os.path.exists(args.output):
+ os.makedirs(args.output, exist_ok=True)
+ fnames = sorted(os.listdir(args.input))
+ fnames = [os.path.join(args.input, fname) for fname in fnames]
+ if args.shuffle:
+ random.shuffle(fnames)
+ print('Start extracting {} files with {} cores'.format(len(fnames), num_process))
+ start_time = time.time()
+ with multiprocessing.Pool(num_process) as pool:
+ iter = pool.imap(
+ functools.partial(
+ extract_files,
+ output_dir=args.output,
+ shuffle=args.shuffle),
+ fnames)
+ for f_index, _ in enumerate(iter):
+ if f_index > 0 and f_index % 250 == 0:
+ elapsed = time.time() - start_time
+ print("Extracted {:}, Elapsed: {:}s, ETA: {:}s, ".format(
+ f_index, int(elapsed), int((len(fnames) - f_index) / (f_index / elapsed))))
+
+ print("Done!")
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/pretrain_corpus/prepare_wikipedia.py b/scripts/datasets/pretrain_corpus/prepare_wikipedia.py
new file mode 100644
index 0000000000..481598c22e
--- /dev/null
+++ b/scripts/datasets/pretrain_corpus/prepare_wikipedia.py
@@ -0,0 +1,253 @@
+"""Prepare the Wikipedia dataset that contain cleaned articles of all languages."""
+import os
+import sys
+import glob
+import math
+import time
+import tarfile
+import argparse
+import multiprocessing
+
+from gluonnlp.registry import DATA_MAIN_REGISTRY, DATA_PARSER_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+
+_CITATION = """\
+@ONLINE {wikidump,
+ author = "Wikimedia Foundation",
+ title = "Wikimedia Downloads",
+ url = "https://dumps.wikimedia.org"
+}
+"""
+
+# See https://en.wikipedia.org/wiki/List_of_Wikipedias for details
+__LANGUAGES_BANK = [
+ "aa", "ab", "ace", "ady", "af", "ak", "als", "am", "an", "ang", "ar", "arc",
+ "arz", "as", "ast", "atj", "av", "ay", "az", "azb", "ba", "bar", "bat-smg",
+ "bcl", "be", "be-x-old", "bg", "bh", "bi", "bjn", "bm", "bn", "bo", "bpy",
+ "br", "bs", "bug", "bxr", "ca", "cbk-zam", "cdo", "ce", "ceb", "ch", "cho",
+ "chr", "chy", "ckb", "co", "cr", "crh", "cs", "csb", "cu", "cv", "cy", "da",
+ "de", "din", "diq", "dsb", "dty", "dv", "dz", "ee", "el", "eml", "en", "eo",
+ "es", "et", "eu", "ext", "fa", "ff", "fi", "fiu-vro", "fj", "fo", "fr",
+ "frp", "frr", "fur", "fy", "ga", "gag", "gan", "gd", "gl", "glk", "gn",
+ "gom", "gor", "got", "gu", "gv", "ha", "hak", "haw", "he", "hi", "hif",
+ "ho", "hr", "hsb", "ht", "hu", "hy", "ia", "id", "ie", "ig", "ii",
+ "ik", "ilo", "inh", "io", "is", "it", "iu", "ja", "jam", "jbo", "jv", "ka",
+ "kaa", "kab", "kbd", "kbp", "kg", "ki", "kj", "kk", "kl", "km", "kn", "ko",
+ "koi", "krc", "ks", "ksh", "ku", "kv", "kw", "ky", "la", "lad", "lb",
+ "lbe", "lez", "lfn", "lg", "li", "lij", "lmo", "ln", "lo", "lrc", "lt",
+ "ltg", "lv", "mai", "map-bms", "mdf", "mg", "mh", "mhr", "mi", "min", "mk",
+ "ml", "mn", "mr", "mrj", "ms", "mt", "mus", "mwl", "my", "myv", "mzn", "na",
+ "nah", "nap", "nds", "nds-nl", "ne", "new", "ng", "nl", "nn", "no", "nov",
+ "nrm", "nso", "nv", "ny", "oc", "olo", "om", "or", "os", "pa", "pag", "pam",
+ "pap", "pcd", "pdc", "pfl", "pi", "pih", "pl", "pms", "pnb", "pnt", "ps",
+ "pt", "qu", "rm", "rmy", "rn", "ro", "roa-rup", "roa-tara", "ru", "rue",
+ "rw", "sa", "sah", "sat", "sc", "scn", "sco", "sd", "se", "sg", "sh", "si",
+ "simple", "sk", "sl", "sm", "sn", "so", "sq", "sr", "srn", "ss", "st",
+ "stq", "su", "sv", "sw", "szl", "ta", "tcy", "te", "tet", "tg", "th", "ti",
+ "tk", "tl", "tn", "to", "tpi", "tr", "ts", "tt", "tum", "tw", "ty", "tyv",
+ "udm", "ug", "uk", "ur", "uz", "ve", "vec", "vep", "vi", "vls", "vo", "wa",
+ "war", "wo", "wuu", "xal", "xh", "xmf", "yi", "yo", "za", "zea", "zh",
+ "zh-classical", "zh-min-nan", "zh-yue", "zu"]
+
+_BASE_URL_TMPL\
+ = "https://dumps.wikimedia.org/{lang}wiki/{date}/{lang}wiki-{date}-pages-articles.xml.bz2"
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'wikipedia.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+_URLS = {
+ 'wikipedia-en-20200620':
+ 'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz',
+}
+
+
+def get_url(lang, date):
+ return _BASE_URL_TMPL.format(lang=lang, date=date)
+
+
+def try_import_wikiextractor():
+ try:
+ sys.path.append(_CURR_DIR)
+ import WikiExtractor
+ except ImportError:
+ try:
+ download(
+ 'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
+ path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
+ sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
+ sys.path.append(_CURR_DIR)
+ import WikiExtractor
+ except BaseException:
+ raise ImportError('Cannot import WikiExtractor! You can download the "WikiExtractor.py"'
+ ' in https://github.com/attardi/wikiextractor to {}'
+ .format(_CURR_DIR))
+ return WikiExtractor
+
+
+def get_formatting_list(wiki_path, recursive=False):
+ """
+ get formatting list of file names from extracted content
+ """
+ filenames = []
+ for dirname in glob.glob(os.path.join(wiki_path, '*'), recursive=False):
+ for filename in glob.glob(os.path.join(dirname, 'wiki_*'), recursive=recursive):
+ filenames.append(filename)
+ return filenames
+
+
+def merge(x):
+ """
+ Puts one article per line
+ """
+ file_list, output_filename = x
+ article_lines = []
+ article_open = False
+
+ with open(output_filename, mode='w', newline='\n') as ofile:
+ for filename in file_list:
+ with open(filename, mode='r', newline='\n') as file:
+ for line in file:
+                    if '<doc id=' in line:
+                        article_open = True
+                    elif '</doc>' in line:
+                        article_open = False
+ for oline in article_lines[1:]:
+ if oline != '\n':
+ ofile.write(oline.rstrip() + " ")
+ ofile.write("\n\n")
+ article_lines = []
+ else:
+ if article_open:
+ article_lines.append(line)
+
+
+@DATA_PARSER_REGISTRY.register('prepare_wikipedia')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Download and Prepare the Wikipedia')
+ parser.add_argument('--mode', type=str,
+ default='download+format',
+ choices=['download', 'format', 'download+format', 'download_prepared'],
+ help='Specify the action you want the app to take. '
+ '"download" means to download the Wikipedia dump. '
+ '"format" means to extract the content and '
+ 'format it for pretraining. "download+format" means to combine '
+                             'these two options. '
+                             '"download_prepared" downloads the prepared txt from S3 directly.')
+ parser.add_argument('--lang', type=str, default='en',
+                        help='Language of the wikipedia dump file. '
+                             'Only English and Chinese are supported in the current version.')
+ parser.add_argument('--date', type=str, default='latest',
+ help='Date of the wikipedia dump file. You can choose a date like '
+ '"--date 20200201" or use "--date latest"')
+ parser.add_argument("-i", "--input", default=None,
+ help="path to XML wiki dump file.")
+ parser.add_argument("-o", "--output", default="wikicorpus",
+ help="directory for downloaded or formatted files")
+ parser.add_argument("-b", "--bytes", default="100M",
+ help="maximum bytes per extracted file (default %(default)s)",
+ metavar="n[KMG]")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--num_out_files", type=int, default=1000,
+ help="Number of desired output files, where each is processed"
+ " independently by a worker.")
+ return parser
+
+
+def download_wikicorpus(lang, date, output):
+    """Download the Wikipedia dump.
+
+    lang: the language code, such as en or zh
+    date: the date of the Wikipedia dump in YYYYMMDD format, or 'latest'
+    output: the directory in which to store the downloaded dump
+    """
+ if not os.path.exists(output):
+ os.makedirs(output)
+ if lang not in __LANGUAGES_BANK:
+ raise ValueError('Unsupported language code')
+ language = lang.replace('-', '_')
+ output_file = os.path.join(output, 'download', language, date,
+ 'wikicorpus.xml.bz2')
+ download(get_url(language, date), output_file)
+ return output_file
+
+
+def format_wikicorpus(input, output, bytes, num_process, num_out_files):
+    if input is None:
+        raise ValueError('The input file is not specified.')
+    if not input.endswith('xml.bz2'):
+        raise ValueError('The input file must be a *.xml.bz2 dump.')
+ if not os.path.exists(output):
+ os.makedirs(output)
+
+ # Use WikiExtractor to extract the content
+ WikiExtractor = try_import_wikiextractor()
+ wiki_path = os.path.join(output, 'extracted')
+ sys.argv = ['prog', '-b', bytes, '-o', wiki_path, input]
+ WikiExtractor.main()
+
+ # Merge extracted content into txt files
+ prepared_path = os.path.join(output, 'prepared_wikipedia')
+ if not os.path.exists(prepared_path):
+ os.makedirs(prepared_path)
+ filenames = get_formatting_list(wiki_path, recursive=True)
+ num_files = len(filenames)
+ num_out_files = min(num_out_files, num_files)
+ file_volume = math.ceil(num_files / num_out_files)
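+    # Split the shard filenames into num_out_files chunks so that each output
+    # file (and worker) merges roughly the same number of extracted shards.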
+ splited_files = [filenames[i: i + file_volume] for i in range(0, num_files, file_volume)]
+ num_out_files = len(splited_files)
+ output_files = [
+ os.path.join(
+ prepared_path,
+ "wikipedia-prepared-{}.txt".format(
+ str(i).zfill(4))) for i in range(num_out_files)]
+ print("All prepared raw text will be saved in {} txt files".format(num_out_files))
+ num_process = min(num_process, num_out_files)
+ print('Start preprocessing {} text files with {} cores'.format(num_files, num_process))
+ process_args = [(splited_files[i], output_files[i]) for i in range(num_out_files)]
+
+ start_time = time.time()
+ with multiprocessing.Pool(num_process) as pool:
+ f_read = 0
+ for i, _ in enumerate(pool.imap(merge, process_args)):
+ elapsed = time.time() - start_time
+ f_read += len(splited_files[i])
+            print("prepared {} files, Elapsed: {:.2f}s, ETA: {:.2f}s".format(
+                f_read, elapsed, (num_files - f_read) / (f_read / elapsed)))
+ print("Done preparation within {:.2f} seconds".format(elapsed))
+
+
+@DATA_MAIN_REGISTRY.register('prepare_wikipedia')
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ if args.mode == 'download':
+ download_wikicorpus(args.lang, args.date, args.output)
+ elif args.mode == 'format':
+ format_wikicorpus(args.input, args.output, args.bytes, num_process, args.num_out_files)
+ elif args.mode == 'download+format':
+ downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
+ format_wikicorpus(downloaded_file, args.output, args.bytes, num_process, args.num_out_files)
+ elif args.mode == 'download_prepared':
+ url = _URLS['wikipedia-en-20200620']
+ file_hash = _URL_FILE_STATS[url]
+ target_download_location = os.path.join(args.output,
+ os.path.basename(url))
+ download(url, target_download_location, sha1_hash=file_hash)
+ tar = tarfile.open(target_download_location)
+ names = tar.getnames()
+ print('Start unarchiving raw text files')
+ start_time = time.time()
+ for name in names:
+ tar.extract(name, path=args.output)
+ tar.close()
+ print("Done unarchiving within {:.2f} seconds".format(time.time() - start_time))
+ else:
+ raise NotImplementedError
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == "__main__":
+ cli_main()
diff --git a/scripts/datasets/question_answering/README.md b/scripts/datasets/question_answering/README.md
new file mode 100644
index 0000000000..96e53f03dd
--- /dev/null
+++ b/scripts/datasets/question_answering/README.md
@@ -0,0 +1,101 @@
+# Question Answering
+
+## SQuAD
+The SQuAD datasets are distributed under the [CC BY-SA 4.0](http://creativecommons.org/licenses/by-sa/4.0/legalcode) license.
+
+Run the following commands to download SQuAD:
+
+```bash
+python3 prepare_squad.py --version 1.1 # Squad 1.1
+python3 prepare_squad.py --version 2.0 # Squad 2.0
+```
+
+For all supported datasets, we also provide a command-line toolkit for downloading them:
+
+```bash
+nlp_data prepare_squad --version 1.1
+nlp_data prepare_squad --version 2.0
+```
+
+The directory structure of the SQuAD dataset will be as follows, where `version` can be 1.1 or 2.0:
+```
+squad
+├── train-v{version}.json
+├── dev-v{version}.json
+```
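+
+As a quick sanity check of the downloaded files, the snippet below is a minimal sketch (not part of the toolkit) that assumes the standard SQuAD json schema (`data` -> `paragraphs` -> `qas`):
+
+```python
+import json
+
+# The path assumes the default --save-path of prepare_squad.py and version 2.0.
+with open('squad/train-v2.0.json', 'r', encoding='utf-8') as f:
+    squad = json.load(f)
+
+# Count the questions across all articles and paragraphs.
+num_questions = sum(len(paragraph['qas'])
+                    for article in squad['data']
+                    for paragraph in article['paragraphs'])
+print('articles:', len(squad['data']), 'questions:', num_questions)
+```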
+
+## SearchQA
+In accordance with the BSD-3-Clause license, we uploaded SearchQA to our S3 bucket and provide links to download the processed txt files. Please check out the [Google Drive link](https://drive.google.com/drive/u/0/folders/1kBkQGooNyG0h8waaOJpgdGtOnlb1S649) to download the raw and split files collected through web search using the scraper from the [GitHub repository](https://github.com/nyu-dl/dl4ir-searchQA).
+
+Download the SearchQA dataset with the Python script or the command-line toolkit:
+
+```bash
+python3 prepare_searchqa.py
+
+# Or download with command-line toolkits
+nlp_data prepare_searchqa
+```
+
+The directory structure of the SearchQA dataset will be as follows:
+```
+searchqa
+├── train.txt
+├── val.txt
+├── test.txt
+```
+
+## TriviaQA
+[TriviaQA](https://nlp.cs.washington.edu/triviaqa/) is an open-domain QA dataset. See more useful scripts in the [official GitHub repository](https://github.com/mandarjoshi90/triviaqa).
+
+Run the following commands to download TriviaQA:
+
+```bash
+python3 prepare_triviaqa.py --version rc # Download TriviaQA version 1.0 for RC (2.5G)
+python3 prepare_triviaqa.py --version unfiltered # Download unfiltered TriviaQA version 1.0 (604M)
+
+# Or download with command-line toolkits
+nlp_data prepare_triviaqa --version rc
+nlp_data prepare_triviaqa --version unfiltered
+```
+
+The directory structure of the TriviaQA (rc and unfiltered) datasets will be as follows:
+```
+triviaqa
+├── triviaqa-rc
+ ├── qa
+ ├── verified-web-dev.json
+ ├── web-dev.json
+ ├── web-train.json
+ ├── web-test-without-answers.json
+ ├── verified-wikipedia-dev.json
+ ├── wikipedia-test-without-answers.json
+ ├── wikipedia-dev.json
+ ├── wikipedia-train.json
+ ├── evidence
+ ├── web
+ ├── wikipedia
+
+├── triviaqa-unfiltered
+ ├── unfiltered-web-train.json
+ ├── unfiltered-web-dev.json
+ ├── unfiltered-web-test-without-answers.json
+```
+
+## HotpotQA
+HotpotQA is distributed under a [CC BY-SA 4.0 license](https://creativecommons.org/licenses/by-sa/4.0/). We only provide the download scripts (run with the following commands); please check out the [GitHub repository](https://github.com/hotpotqa/hotpot) for the details of preprocessing and evaluation.
+
+```bash
+python3 prepare_hotpotqa.py
+
+# Or download with command-line toolkits
+nlp_data prepare_hotpotqa
+```
+
+The directory structure of the HotpotQA dataset will be as follows:
+```
+hotpotqa
+├── hotpot_train_v1.1.json
+├── hotpot_dev_fullwiki_v1.json
+├── hotpot_dev_distractor_v1.json
+├── hotpot_test_fullwiki_v1.json
+```
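+
+As a quick check of the download, the following is a minimal sketch (not part of the toolkit) that assumes the standard HotpotQA layout, i.e. each json file is a list of records with fields such as `question`, `answer`, and `supporting_facts`:
+
+```python
+import json
+
+# The path assumes the default --save-path of prepare_hotpotqa.py.
+with open('hotpotqa/hotpot_dev_distractor_v1.json', 'r', encoding='utf-8') as f:
+    examples = json.load(f)
+
+print('number of examples:', len(examples))
+print('first question:', examples[0]['question'])
+```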
diff --git a/scripts/datasets/question_answering/__init__.py b/scripts/datasets/question_answering/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/datasets/question_answering/prepare_hotpotqa.py b/scripts/datasets/question_answering/prepare_hotpotqa.py
new file mode 100644
index 0000000000..f894b91f8a
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_hotpotqa.py
@@ -0,0 +1,62 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'hotpotqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'hotpotqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@inproceedings{yang2018hotpotqa,
+ title={{HotpotQA}: A Dataset for Diverse, Explainable Multi-hop Question Answering},
+ author={Yang, Zhilin and Qi, Peng and Zhang, Saizheng and Bengio, Yoshua and Cohen, William W. and Salakhutdinov, Ruslan and Manning, Christopher D.},
+ booktitle={Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
+ year={2018}
+}
+
+"""
+
+_URLS = {
+ 'train': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json',
+ 'dev_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json',
+ 'dev_distractor': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json',
+ 'test_fullwiki': 'http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json',
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_hotpotqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the HotpotQA Dataset.')
+ parser.add_argument('--save-path', type=str, default='hotpotqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_hotpotqa')
+def main(args):
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
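+    # Download every split into the cache directory and expose it in the save
+    # directory through a symlink, so repeated runs can reuse the cached copy.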
+ for url in _URLS.values():
+ file_name = url[url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[url]
+ download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_searchqa.py b/scripts/datasets/question_answering/prepare_searchqa.py
new file mode 100644
index 0000000000..f48236a944
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_searchqa.py
@@ -0,0 +1,61 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'searchqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'searchqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@article{dunn2017searchqa,
+ title={Searchqa: A new q\&a dataset augmented with context from a search engine},
+ author={Dunn, Matthew and Sagun, Levent and Higgins, Mike and Guney, V Ugur and Cirik, Volkan and Cho, Kyunghyun},
+ journal={arXiv preprint arXiv:1704.05179},
+ year={2017}
+}
+
+"""
+
+_URLS = {
+ 'train': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt',
+ 'val': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt',
+ 'test': 's3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_searchqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the SearchQA Dataset.')
+ parser.add_argument('--save-path', type=str, default='searchqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_searchqa')
+def main(args):
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ for url in _URLS.values():
+ file_name = url[url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[url]
+ download(url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_squad.py b/scripts/datasets/question_answering/prepare_squad.py
new file mode 100644
index 0000000000..777a336609
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_squad.py
@@ -0,0 +1,81 @@
+import os
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'squad')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'squad.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@inproceedings{rajpurkar2016squad,
+ title={Squad: 100,000+ questions for machine comprehension of text},
+ author={Rajpurkar, Pranav and Zhang, Jian and Lopyrev, Konstantin and Liang, Percy},
+ booktitle={EMNLP},
+ year={2016}
+}
+
+@inproceedings{rajpurkar2018know,
+ title={Know What You Don't Know: Unanswerable Questions for SQuAD},
+ author={Rajpurkar, Pranav and Jia, Robin and Liang, Percy},
+ booktitle={ACL},
+ year={2018}
+}
+
+"""
+
+_URLS = {
+ '1.1': {
+ 'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json',
+ 'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json',
+ },
+ '2.0': {
+ 'train': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json',
+ 'dev': 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
+ }
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_squad')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the SQuAD Dataset.')
+ parser.add_argument('--version', type=str, choices=['1.1', '2.0'], default='1.1',
+ help='Version of the squad dataset.')
+ parser.add_argument('--save-path', type=str, default='squad')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_squad')
+def main(args):
+ train_url = _URLS[args.version]['train']
+ dev_url = _URLS[args.version]['dev']
+ train_file_name = train_url[train_url.rfind('/') + 1:]
+ dev_file_name = dev_url[dev_url.rfind('/') + 1:]
+    download(train_url, path=os.path.join(args.cache_path, train_file_name),
+             sha1_hash=_URL_FILE_STATS[train_url])
+    download(dev_url, path=os.path.join(args.cache_path, dev_file_name),
+             sha1_hash=_URL_FILE_STATS[dev_url])
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ if not os.path.exists(os.path.join(args.save_path, train_file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, train_file_name),
+ os.path.join(args.save_path, train_file_name))
+ if not os.path.exists(os.path.join(args.save_path, dev_file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, dev_file_name),
+ os.path.join(args.save_path, dev_file_name))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/question_answering/prepare_triviaqa.py b/scripts/datasets/question_answering/prepare_triviaqa.py
new file mode 100644
index 0000000000..d67886fc1b
--- /dev/null
+++ b/scripts/datasets/question_answering/prepare_triviaqa.py
@@ -0,0 +1,77 @@
+import os
+import tarfile
+import argparse
+from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY
+from gluonnlp.utils.misc import download, load_checksum_stats
+from gluonnlp.base import get_data_home_dir
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_BASE_DATASET_PATH = os.path.join(get_data_home_dir(), 'triviaqa')
+_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'triviaqa.txt')
+_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)
+
+
+_CITATIONS = """
+@InProceedings{JoshiTriviaQA2017,
+ author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. and Zettlemoyer, Luke},
+ title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
+ booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},
+ month = {July},
+ year = {2017},
+ address = {Vancouver, Canada},
+ publisher = {Association for Computational Linguistics},
+}
+
+"""
+
+_URLS = {
+ 'rc': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz',
+ 'unfiltered': 'https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz'
+}
+
+
+@DATA_PARSER_REGISTRY.register('prepare_triviaqa')
+def get_parser():
+ parser = argparse.ArgumentParser(description='Downloading the TriviaQA Dataset.')
+    parser.add_argument('--version', type=str, choices=['rc', 'unfiltered'], default='rc',
+                        help='Version of the TriviaQA dataset.')
+ parser.add_argument('--save-path', type=str, default='triviaqa')
+ parser.add_argument('--cache-path', type=str, default=_BASE_DATASET_PATH,
+ help='The path to download the dataset.')
+ parser.add_argument('--overwrite', action='store_true')
+ return parser
+
+
+@DATA_MAIN_REGISTRY.register('prepare_triviaqa')
+def main(args):
+
+ def extract(tar_path, target_path):
+ try:
+ tar = tarfile.open(tar_path, "r:gz")
+ file_names = tar.getnames()
+ for file_name in file_names:
+ tar.extract(file_name, target_path)
+ tar.close()
+ except Exception as e:
+ print(e)
+
+    tar_url = _URLS[args.version]
+ file_name = tar_url[tar_url.rfind('/') + 1:]
+ file_hash = _URL_FILE_STATS[tar_url]
+ download(tar_url, path=os.path.join(args.cache_path, file_name), sha1_hash=file_hash)
+ if not os.path.exists(args.save_path):
+ os.makedirs(args.save_path)
+ if not os.path.exists(os.path.join(args.save_path, file_name))\
+ or (args.overwrite and args.save_path != args.cache_path):
+ os.symlink(os.path.join(args.cache_path, file_name),
+ os.path.join(args.save_path, file_name))
+ extract(os.path.join(args.save_path, file_name), args.save_path)
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
diff --git a/scripts/datasets/update_download_stats.py b/scripts/datasets/update_download_stats.py
new file mode 100644
index 0000000000..4ab37f9e52
--- /dev/null
+++ b/scripts/datasets/update_download_stats.py
@@ -0,0 +1,122 @@
+import hashlib
+import requests
+import time
+import os
+import copy
+from collections import OrderedDict
+from gluonnlp.cli.data.machine_translation.prepare_wmt\
+ import _PARA_URLS as wmt_para_urls, _MONOLINGUAL_URLS as wmt_mono_urls
+from gluonnlp.cli.data.question_answering.prepare_squad import _URLS as squad_urls
+from gluonnlp.cli.data.question_answering.prepare_triviaqa import _URLS as triviaqa_url
+from gluonnlp.cli.data.question_answering.prepare_hotpotqa import _URLS as hotpotqa_urls
+from gluonnlp.cli.data.question_answering.prepare_searchqa import _URLS as searchqa_urls
+from gluonnlp.cli.data.language_modeling.prepare_lm import _URLS as lm_urls
+from gluonnlp.cli.data.music_generation.prepare_music_midi import _URLS as midi_urls
+from gluonnlp.cli.data.pretrain_corpus.prepare_bookcorpus import _URLS as book_urls
+from gluonnlp.cli.data.general_nlp_benchmark.prepare_glue import SUPERGLUE_TASK2PATH as superglue_urls
+from gluonnlp.cli.data.general_nlp_benchmark.prepare_glue import GLUE_TASK2PATH as glue_urls
+
+
+_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+_CHECK_SUM_BASE = os.path.join(_CURR_DIR, 'url_checksums')
+
+
+def get_hash_and_size(obj, retries=5, algorithm='sha1', cache=None, save_path=None,
+ verify_ssl=True):
+    """Fetch the hash and file size of every URL contained in the input object."""
+ def _get_hash_and_size(obj, retries, algorithm, cache=None, save_path=None):
+ if isinstance(obj, str):
+ if obj.startswith('http://') or obj.startswith('https://'):
+ url = obj
+ hex_hash = None
+ file_size = None
+ if cache is not None and obj in cache:
+ return obj, cache[obj]
+ while retries + 1 > 0:
+                # Disable pylint warning about catching a too-broad Exception
+ # pylint: disable=W0703
+ try:
+ if algorithm == 'sha1':
+ m = hashlib.sha1()
+ elif algorithm == 'sha256':
+ m = hashlib.sha256()
+ elif algorithm == 'md5':
+ m = hashlib.md5()
+ else:
+ raise NotImplementedError
+ print('Calculating hash of the file downloaded from {}...'.format(url))
+ start = time.time()
+ r = requests.get(url, stream=True, verify=verify_ssl)
+ if r.status_code != 200:
+ raise RuntimeError('Failed downloading url {}'.format(url))
+ f_size = 0
+ for chunk in r.iter_content(chunk_size=10240):
+ if chunk: # filter out keep-alive new chunks
+ m.update(chunk)
+ f_size += len(chunk)
+ hex_hash = m.hexdigest()
+ file_size = f_size
+ end = time.time()
+ print('{}={}, size={}, Time spent={}'.format(algorithm, hex_hash, file_size,
+ end - start))
+ if cache is None:
+ cache = OrderedDict()
+ cache[url] = (hex_hash, file_size)
+ if save_path is not None:
+ with open(save_path, 'a', encoding='utf-8') as of:
+ of.write('{} {} {}\n'.format(url, hex_hash, file_size))
+ break
+ except Exception as e:
+ retries -= 1
+ if retries <= 0:
+ raise e
+ print('download failed due to {}, retrying, {} attempt{} left'
+ .format(repr(e), retries, 's' if retries > 1 else ''))
+ return obj, (hex_hash, file_size)
+ else:
+ return obj
+ elif isinstance(obj, tuple):
+ return tuple((_get_hash_and_size(ele, retries, algorithm, cache, save_path)
+ for ele in obj))
+ elif isinstance(obj, list):
+ return [_get_hash_and_size(ele, retries, algorithm, cache, save_path) for ele in obj]
+ elif isinstance(obj, dict):
+ return {k: _get_hash_and_size(v, retries, algorithm, cache, save_path)
+ for k, v in obj.items()}
+ else:
+ return obj
+ if cache is None:
+ cache = OrderedDict()
+ else:
+ cache = copy.deepcopy(cache)
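+    # Seed the cache with hashes recorded in a previous run so that unchanged
+    # URLs do not need to be downloaded and hashed again.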
+ if save_path is not None and os.path.exists(save_path):
+ with open(save_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ url, hex_hash, file_size = line.split()
+ cache[url] = (hex_hash, file_size)
+ _get_hash_and_size(obj, retries, algorithm, cache, save_path)
+ return obj, cache
+
+
+if __name__ == '__main__':
+ get_hash_and_size([wmt_para_urls, wmt_mono_urls],
+ save_path=os.path.join(_CHECK_SUM_BASE, 'wmt.txt'))
+ get_hash_and_size(squad_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'squad.txt'))
+ get_hash_and_size(hotpotqa_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'hotpotqa.txt'))
+ get_hash_and_size(triviaqa_url,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'triviaqa.txt'))
+    get_hash_and_size(searchqa_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'searchqa.txt'))
+ get_hash_and_size(lm_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'language_model.txt'))
+ get_hash_and_size(midi_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'music_midi.txt'))
+ get_hash_and_size(book_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'book_corpus.txt'))
+ get_hash_and_size(glue_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'glue.txt'))
+ get_hash_and_size(superglue_urls,
+ save_path=os.path.join(_CHECK_SUM_BASE, 'superglue.txt'))
diff --git a/scripts/datasets/url_checksums/book_corpus.txt b/scripts/datasets/url_checksums/book_corpus.txt
new file mode 100644
index 0000000000..abacb7b93e
--- /dev/null
+++ b/scripts/datasets/url_checksums/book_corpus.txt
@@ -0,0 +1 @@
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/pretrain_corpus/Gutenberg.zip 91e842dc3671ed5a917b7ff6a60f5f87397780e2 461506225
diff --git a/scripts/datasets/url_checksums/glue.txt b/scripts/datasets/url_checksums/glue.txt
new file mode 100644
index 0000000000..f29bb8d9d7
--- /dev/null
+++ b/scripts/datasets/url_checksums/glue.txt
@@ -0,0 +1,14 @@
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FCoLA.zip?alt=media&token=46d5e637-3411-4188-bc44-5809b5bfb5f4 19096246cd2a06d8fe2d13880d6cec61149f77c7 376971
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8 44f5954391612a8b3d9d65f6d4a824e9ae8d19ce 7439277
+https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_train.txt 716e0f67af962f08220b7e97d229b293077ef41f 1047044
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2Fmrpc_dev_ids.tsv?alt=media&token=ec5c0836-31d5-48f4-b431-7480817f1adc 506c7a1a5e0dd551ceec2f84070fa1a8c2bc4b41 6222
+https://dl.fbaipublicfiles.com/senteval/senteval_data/msr_paraphrase_test.txt 4265196c15cf75620b0b592b8b921f543bda7e6c 441275
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQQP-clean.zip?alt=media&token=11a647cb-ecd3-49c9-9d31-79f8ca8fe277 d775bd543ee78e3f64892a43ada949daf93e003d 41696084
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSTS-B.zip?alt=media&token=bddb94a7-8706-4e0d-a694-1109e12273b5 cc66d8533052de6d7475ac56dfce300751e070a4 802872
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FMNLI.zip?alt=media&token=50329ea1-e339-40e2-809c-10c40afff3ce c22c684daa5cc9fad949d09d10ecedf94a2ce053 312783507
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSNLI.zip?alt=media&token=4afcfbb2-ff0c-4b2d-a09a-dbf07926f4df c60db4cc8820749e6af9f713f4d55109dd46e8c1 129820157
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FQNLIv2.zip?alt=media&token=6fdcf570-0fc5-4631-8456-9505272d1601 6700cb1d2536bf512314b01350f9ac382439218e 10627589
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FRTE.zip?alt=media&token=5efa7e85-a0bb-4f19-8ea2-9e1840f077fb 2eb8630df898b7d8df14ca9130c1ac1cf79eb376 697150
+https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FWNLI.zip?alt=media&token=068ad0a0-ded7-4bd7-99a5-5e00222e0faf fc9834b5a8af4e1d8412e48bc38b477510a8c2d0 28999
+https://storage.googleapis.com/mtl-sentence-representations.appspot.com/tsvsWithoutLabels%2FAX.tsv?GoogleAccessId=firebase-adminsdk-0khhl@mtl-sentence-representations.iam.gserviceaccount.com&Expires=2498860800&Signature=DuQ2CSPt2Yfre0C%2BiISrVYrIFaZH1Lc7hBVZDD4ZyR7fZYOMNOUGpi8QxBmTNOrNPjR3z1cggo7WXFfrgECP6FBJSsURv8Ybrue8Ypt%2FTPxbuJ0Xc2FhDi%2BarnecCBFO77RSbfuz%2Bs95hRrYhTnByqu3U%2FYZPaj3tZt5QdfpH2IUROY8LiBXoXS46LE%2FgOQc%2FKN%2BA9SoscRDYsnxHfG0IjXGwHN%2Bf88q6hOmAxeNPx6moDulUF6XMUAaXCSFU%2BnRO2RDL9CapWxj%2BDl7syNyHhB7987hZ80B%2FwFkQ3MEs8auvt5XW1%2Bd4aCU7ytgM69r8JDCwibfhZxpaa4gd50QXQ%3D%3D c137a2020ab489011dc38fde9ee429f4e2c71257 222257
+https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv?dl=1 2f46c4b80fea8d3ea52a28e05467af3332fa65d9 265530
diff --git a/scripts/datasets/url_checksums/hotpotqa.txt b/scripts/datasets/url_checksums/hotpotqa.txt
new file mode 100644
index 0000000000..17b96cf3c6
--- /dev/null
+++ b/scripts/datasets/url_checksums/hotpotqa.txt
@@ -0,0 +1,4 @@
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json 08c42431c22984f362e94de0e635c7b858c3cff0 566426227
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json 825b6cfc34a61db41e82bbb14d978d5a834925f8 46320117
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_fullwiki_v1.json 96a41025612e8cb15989251102dc05efe9647eda 47454698
+http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_test_fullwiki_v1.json b30e4ff0d8b7bd808240e5609410f9c36279ef36 46213747
diff --git a/scripts/datasets/url_checksums/language_model.txt b/scripts/datasets/url_checksums/language_model.txt
new file mode 100644
index 0000000000..f5ce7ef716
--- /dev/null
+++ b/scripts/datasets/url_checksums/language_model.txt
@@ -0,0 +1,6 @@
+https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip 3c914d17d80b1459be871a5039ac23e752a53cbe 4475746
+https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip 0aec09a7537b58d4bb65362fee27650eeaba625a 190229076
+http://mattmahoney.net/dc/enwik8.zip d856b1ccd937c51aeb9c342e47666fb8c38e7e72 36445475
+http://mattmahoney.net/dc/text8.zip 6c70299b93b7e1f927b42cd8f6ac1a31547c7a2e 31344016
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1-billion-word-language-modeling-benchmark-r13output.tar.gz 4df859766482e12264a5a9d9fb7f0e276020447d 1792209805
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/datasets/language_modeling/1b_word_vocab.txt aa2322a3da82ef628011336c9b5c6059e4f56c3f 9507106
diff --git a/scripts/datasets/url_checksums/mirror/wmt.json b/scripts/datasets/url_checksums/mirror/wmt.json
new file mode 100644
index 0000000000..fa695f6bd9
--- /dev/null
+++ b/scripts/datasets/url_checksums/mirror/wmt.json
@@ -0,0 +1,48 @@
+{
+ "http://www.statmt.org/europarl/v7/cs-en.tgz" : "datasets/third_party_mirror/cs-en-28bad3e096923694fb776b6cd6ba1079546a9e58.tgz",
+ "http://www.statmt.org/europarl/v7/de-en.tgz" : "datasets/third_party_mirror/de-en-53bb5408d22977c89284bd755717e6bbb5b12bc5.tgz",
+ "http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz" : "datasets/third_party_mirror/training-parallel-ep-v8-2f5c2c2c98b72921474a3f1837dc5b61dd44ba88.tgz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.cs-en.tsv-e36a1bfe634379ec813b399b57a38093df2349ef.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.de-en.tsv-d553d0c8189642c1c7ae6ed3c265c847e432057c.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.fi-en.tsv-c5d2f6aad04e88dda6ad11a110f4ca24150edca3.gz",
+ "http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz" : "datasets/third_party_mirror/europarl-v9.lt-en.tsv-a6343d8fc158f44714ea7d01c0eb65b34640841d.gz",
+ "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz" : "datasets/third_party_mirror/training-parallel-commoncrawl-1c0ad85f0ebaf1d543acb009607205f5dae6627d.tgz",
+ "http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz" : "datasets/third_party_mirror/training-parallel-nc-v9-c7ae7f50cd45c2f3014d78ddba25a4a8a851e27a.tgz",
+ "http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz" : "datasets/third_party_mirror/training-parallel-nc-v10-6c3c45b0f34d5e84a4d0b75a5edcca226ba7d6c2.tgz",
+ "http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz" : "datasets/third_party_mirror/training-parallel-nc-v11-f51a1f03908e790d23d10001e92e09ce9555a790.tgz",
+ "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz" : "datasets/third_party_mirror/training-parallel-nc-v12-d98afc59e1d753485530b377ff65f1f891d3bced.tgz",
+ "http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz" : "datasets/third_party_mirror/training-parallel-nc-v13-cbaa7834e58d36f228336e3caee6a9056029ff5d.tgz",
+ "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz" : "datasets/third_party_mirror/news-commentary-v14.de-en.tsv-c1fd94c7c9ff222968cbd45100bdd8dbeb5ab2aa.gz",
+ "http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz" : "datasets/third_party_mirror/news-commentary-v14.en-zh.tsv-4ca5c01deeba5425646d42f9598d081cd662908b.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.cs-en.tsv-6e094d218dfd8f987fa1a18ea7b4cb127cfb1763.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.cs-pl.tsv-dc93d346d151bf73e4165d6db425b903fc21a5b0.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.de-en.tsv-e141c55c43a474e06c259c3fa401288b39cd4315.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.es-pt.tsv-c3bd398d57471ee4ab33323393977b8d475a368c.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.fi-en.tsv-5668b004567ca286d1aad9c2b45862a441d79667.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.gu-en.tsv-95b9f15b6a86bfed6dc9bc91597368fd334f436e.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.hi-ne.tsv-6d63908950c72bc8cc69ca470deccff11354afc2.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.kk-en.tsv-56ee1e450ef98fe92ea2116c3ce7acc7c7c42b39.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.lt-en.tsv-b8829928686727165eec6c591d2875d12d7c0cfe.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.ru-en.tsv-16d8d231fdf6347b4cc7834654adec80153ff7a4.gz",
+ "http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz" : "datasets/third_party_mirror/wikititles-v1.zh-en.tsv-5829097ff7dd61752f29fb306b04d790a1a1cfd7.gz",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00" : "datasets/third_party_mirror/UNv1.0.en-ru-98c4e01e16070567d27da0ab4fe401f309dd3678.tar.gz.00",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01" : "datasets/third_party_mirror/UNv1.0.en-ru-86c6013dc88f353d2d6e591928e7549060fcb949.tar.gz.01",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" : "datasets/third_party_mirror/UNv1.0.en-ru-bf6b18a33c8cafa6889fd463fa8a2850d8877d35.tar.gz.02",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00" : "datasets/third_party_mirror/UNv1.0.en-zh-1bec5f10297512183e483fdd4984d207700657d1.tar.gz.00",
+ "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" : "datasets/third_party_mirror/UNv1.0.en-zh-15df2968bc69ef7662cf3029282bbb62cbf107b1.tar.gz.01",
+ "http://data.statmt.org/wmt17/translation-task/rapid2016.tgz" : "datasets/third_party_mirror/rapid2016-8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2.tgz",
+ "http://data.statmt.org/wmt19/translation-task/dev.tgz" : "datasets/third_party_mirror/dev-451ce2cae815c8392212ccb3f54f5dcddb9b2b9e.tgz",
+ "http://data.statmt.org/wmt19/translation-task/test.tgz" : "datasets/third_party_mirror/test-ce02a36fb2cd41abfa19d36eb8c8d50241ed3346.tgz",
+ "http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2007.de.shuffled.deduped-9d746b9df345f764e6e615119113c70e3fb0858c.gz",
+ "http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2008.de.shuffled.deduped-185a24e8833844486aee16cb5decf9a64da1c101.gz",
+ "http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2009.de.shuffled.deduped-9f7645fc6467de88f4205d94f483194838bad8ce.gz",
+ "http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2010.de.shuffled.deduped-f29b761194e9606f086102cfac12813931575818.gz",
+ "http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2011.de.shuffled.deduped-613b16e7a1cb8559dd428525a4c3b42c8a4dc278.gz",
+ "http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2012.de.shuffled.deduped-1bc419364ea3fe2f9ba4236947c012d4198d9282.gz",
+ "http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2013.de.shuffled.deduped-3edd84a7f105907608371c81babc7a9078f40aac.gz",
+ "http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2014.de.shuffled.deduped-1466c67b330c08ab5ab7d48e666c1d3a0bb4e479.gz",
+ "http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2015.de.shuffled.deduped-2c6d5ec9f8fe51e9eb762be8ff7107c6116c00c4.gz",
+ "http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2016.de.shuffled.deduped-e7d235c5d28e36dcf6382f1aa12c6ff37d4529bb.gz",
+ "http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2017.de.shuffled.deduped-f70b4a67bc04c0fdc2ec955b737fa22681e8c038.gz",
+ "http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz" : "datasets/third_party_mirror/news.2018.de.shuffled.deduped-43f8237de1e219276c0682255def13aa2cb80e35.gz"
+}
\ No newline at end of file
diff --git a/scripts/datasets/url_checksums/music_midi.txt b/scripts/datasets/url_checksums/music_midi.txt
new file mode 100644
index 0000000000..84394518ea
--- /dev/null
+++ b/scripts/datasets/url_checksums/music_midi.txt
@@ -0,0 +1,7 @@
+http://hog.ee.columbia.edu/craffel/lmd/lmd_full.tar.gz 330b3c67f24f9280f81e1f7ab12749087dd83f08 1768163879
+http://hog.ee.columbia.edu/craffel/lmd/lmd_matched.tar.gz 218b7c82ecb230a6679053e48e87714f0bd4836f 1407072670
+http://hog.ee.columbia.edu/craffel/lmd/lmd_aligned.tar.gz 9873e84dd5a531ba3623e0a24ce33a81681cba80 272169548
+http://hog.ee.columbia.edu/craffel/lmd/clean_midi.tar.gz ae47e29dfc18d7779d95697a6461d759504c7a1c 234283029
+https://storage.googleapis.com/magentadata/datasets/maestro/v1.0.0/maestro-v1.0.0-midi.zip e189d8a0b6769f3be576a036da840adafe489327 46579421
+https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip 13808bf9503c72371d38e9705e93ce8623b21c01 59243107
+https://archive.org/download/archiveteam-geocities-midi-collection-2009/2009.GeoCities.MIDI.ArchiveTeam.zip 493880759c648dd96167a2f4d394421e6fa33874 437506993
diff --git a/scripts/datasets/url_checksums/searchqa.txt b/scripts/datasets/url_checksums/searchqa.txt
new file mode 100644
index 0000000000..12ba03a7d5
--- /dev/null
+++ b/scripts/datasets/url_checksums/searchqa.txt
@@ -0,0 +1,3 @@
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/train.txt c7e1eb8c34d0525547b91e18b3f8f4d855e35c16 1226681217
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/test.txt 08a928e0f8c129d5b3ca43bf46df117e38be0c27 332064988
+s3://gluonnlp-numpy-data/datasets/question_answering/searchqa/val.txt c2f65d6b83c26188d5998ab96bc6a38c1a127fcc 170835902
diff --git a/scripts/datasets/url_checksums/squad.txt b/scripts/datasets/url_checksums/squad.txt
new file mode 100644
index 0000000000..ee6f52e66f
--- /dev/null
+++ b/scripts/datasets/url_checksums/squad.txt
@@ -0,0 +1,4 @@
+https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json 1faea1252438a64f9718412a55036b786cfcc636 30288272
+https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json e1621aae0683b346ee9743bd5609266ba0cc34fc 4854279
+https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json ceb2acdea93b9d82ab1829c7b1e03bee9e302c99 42123633
+https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json 53ebaeb15bc5cab36645150f6f65d074348e2f3d 4370528
diff --git a/scripts/datasets/url_checksums/superglue.txt b/scripts/datasets/url_checksums/superglue.txt
new file mode 100644
index 0000000000..897bb2e490
--- /dev/null
+++ b/scripts/datasets/url_checksums/superglue.txt
@@ -0,0 +1,10 @@
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/CB.zip c16fa0a46f0f888d59767851c44d8db397896fe5 75482
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/COPA.zip ef110b215d7ff95a2fd2d0133f0959d324e9eec3 43986
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/MultiRC.zip 05bfcb1da7ea06742266f24503342fc20b2ab88a 1116225
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/RTE.zip 66105efeccc3fc54f9c5539de4c2d393d5bb4d36 750920
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WiC.zip 5b95487a3690abc718bc173ccd35bf084c43b10a 396213
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/WSC.zip 829ec3dd532284281cc19bacf9cded6c11d3452d 32751
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-b.zip 8c8874dcace4942dd00cf9f76c2537ea0e2026eb 33950
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/AX-g.zip 949909079262bc4f6fb66bd889707aa71218975f 10413
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/BoolQ.zip 90bf152c8012869d326260709404ce5111a76b46 4118001
+https://dl.fbaipublicfiles.com/glue/superglue/data/v2/ReCoRD.zip af2825be511efa8fbc7813756e768efffb8fcc11 51757880
diff --git a/scripts/datasets/url_checksums/triviaqa.txt b/scripts/datasets/url_checksums/triviaqa.txt
new file mode 100644
index 0000000000..e31be83f58
--- /dev/null
+++ b/scripts/datasets/url_checksums/triviaqa.txt
@@ -0,0 +1,2 @@
+https://nlp.cs.washington.edu/triviaqa/data/triviaqa-rc.tar.gz aa7d8c01d4a5e563caaeb648e8c1f506e353ebd6 2665779500
+https://nlp.cs.washington.edu/triviaqa/data/triviaqa-unfiltered.tar.gz 670ba904b286865e25bb67ebd31c25e7c74c18ae 632549060
diff --git a/scripts/datasets/url_checksums/wikipedia.txt b/scripts/datasets/url_checksums/wikipedia.txt
new file mode 100644
index 0000000000..2f4c117a9e
--- /dev/null
+++ b/scripts/datasets/url_checksums/wikipedia.txt
@@ -0,0 +1 @@
+https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikipedia-en-20200620.tar.gz 1e1d77c31622744aaa45ff5bfbfca397154d9186 5068070627
diff --git a/scripts/datasets/url_checksums/wmt.txt b/scripts/datasets/url_checksums/wmt.txt
new file mode 100644
index 0000000000..195fdf1a6a
--- /dev/null
+++ b/scripts/datasets/url_checksums/wmt.txt
@@ -0,0 +1,58 @@
+http://www.statmt.org/europarl/v7/cs-en.tgz 28bad3e096923694fb776b6cd6ba1079546a9e58 62062621
+http://www.statmt.org/europarl/v7/de-en.tgz 53bb5408d22977c89284bd755717e6bbb5b12bc5 197785698
+http://data.statmt.org/wmt18/translation-task/training-parallel-ep-v8.tgz 2f5c2c2c98b72921474a3f1837dc5b61dd44ba88 246201434
+http://www.statmt.org/europarl/v9/training/europarl-v9.cs-en.tsv.gz e36a1bfe634379ec813b399b57a38093df2349ef 68176874
+http://www.statmt.org/europarl/v9/training/europarl-v9.de-en.tsv.gz d553d0c8189642c1c7ae6ed3c265c847e432057c 204454328
+http://www.statmt.org/europarl/v9/training/europarl-v9.fi-en.tsv.gz c5d2f6aad04e88dda6ad11a110f4ca24150edca3 194574376
+http://www.statmt.org/europarl/v9/training/europarl-v9.lt-en.tsv.gz a6343d8fc158f44714ea7d01c0eb65b34640841d 64351345
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-cs.bicleaner07.tmx.gz 201fc692d4e730cc63e0b1274f98769eeab2faad 957135146
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-de.bicleaner07.tmx.gz 7930ac4d7aa1d17467edc04a45f3ed2ffe809a30 9091373722
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-fi.bicleaner07.tmx.gz 2485ce022a8027a4cec60eed0e35b989d2302e32 726455593
+https://s3.amazonaws.com/web-language-models/paracrawl/release3/en-lt.bicleaner07.tmx.gz 926dfcd0aba9cc46e6e1a099047a49ee01745d63 286088883
+https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz 6a4c43a2fac39153f2320984a0f13bf4266696d8 667981874
+http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz 1c0ad85f0ebaf1d543acb009607205f5dae6627d 918311367
+http://www.statmt.org/wmt14/training-parallel-nc-v9.tgz c7ae7f50cd45c2f3014d78ddba25a4a8a851e27a 80418416
+http://www.statmt.org/wmt15/training-parallel-nc-v10.tgz 6c3c45b0f34d5e84a4d0b75a5edcca226ba7d6c2 125136590
+http://data.statmt.org/wmt16/translation-task/training-parallel-nc-v11.tgz f51a1f03908e790d23d10001e92e09ce9555a790 75178032
+http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz d98afc59e1d753485530b377ff65f1f891d3bced 168591139
+http://data.statmt.org/wmt18/translation-task/training-parallel-nc-v13.tgz cbaa7834e58d36f228336e3caee6a9056029ff5d 113157482
+http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.de-en.tsv.gz c1fd94c7c9ff222968cbd45100bdd8dbeb5ab2aa 39390551
+http://data.statmt.org/news-commentary/v14/training/news-commentary-v14.en-zh.tsv.gz 4ca5c01deeba5425646d42f9598d081cd662908b 36696769
+http://data.statmt.org/wikititles/v1/wikititles-v1.cs-en.tsv.gz 6e094d218dfd8f987fa1a18ea7b4cb127cfb1763 5112423
+http://data.statmt.org/wikititles/v1/wikititles-v1.cs-pl.tsv.gz dc93d346d151bf73e4165d6db425b903fc21a5b0 3525297
+http://data.statmt.org/wikititles/v1/wikititles-v1.de-en.tsv.gz e141c55c43a474e06c259c3fa401288b39cd4315 17919359
+http://data.statmt.org/wikititles/v1/wikititles-v1.es-pt.tsv.gz c3bd398d57471ee4ab33323393977b8d475a368c 7916897
+http://data.statmt.org/wikititles/v1/wikititles-v1.fi-en.tsv.gz 5668b004567ca286d1aad9c2b45862a441d79667 5101486
+http://data.statmt.org/wikititles/v1/wikititles-v1.gu-en.tsv.gz 95b9f15b6a86bfed6dc9bc91597368fd334f436e 177183
+http://data.statmt.org/wikititles/v1/wikititles-v1.hi-ne.tsv.gz 6d63908950c72bc8cc69ca470deccff11354afc2 184765
+http://data.statmt.org/wikititles/v1/wikititles-v1.kk-en.tsv.gz 56ee1e450ef98fe92ea2116c3ce7acc7c7c42b39 1575037
+http://data.statmt.org/wikititles/v1/wikititles-v1.lt-en.tsv.gz b8829928686727165eec6c591d2875d12d7c0cfe 1725255
+http://data.statmt.org/wikititles/v1/wikititles-v1.ru-en.tsv.gz 16d8d231fdf6347b4cc7834654adec80153ff7a4 20299017
+http://data.statmt.org/wikititles/v1/wikititles-v1.zh-en.tsv.gz 5829097ff7dd61752f29fb306b04d790a1a1cfd7 12974754
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 98c4e01e16070567d27da0ab4fe401f309dd3678 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 86c6013dc88f353d2d6e591928e7549060fcb949 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02 bf6b18a33c8cafa6889fd463fa8a2850d8877d35 306221588
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 1bec5f10297512183e483fdd4984d207700657d1 1073741824
+https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01 15df2968bc69ef7662cf3029282bbb62cbf107b1 312943879
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casia2015.zip b432394685e4c53797e1ac86851f8a013aef27a2 98159063
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/casict2011.zip 769a9a86c24e9507dbf520b950b9026120cb041e 166957775
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2015.zip 6d94cc8d296dd4268ed0a10fa3a419267280363e 100118018
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/datum2017.zip 480fa06760b2dbe7c9a9bd7c3fd5e5b22b860a45 37389573
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt/parallel/neu2017.zip 532b56ba62f6cffccdc85f4316468873ca739bd1 148681171
+http://data.statmt.org/wmt17/translation-task/rapid2016.tgz 8b173ce0bc77f2a1a57c8134143e3b5ae228a6e2 163416042
+https://s3-eu-west-1.amazonaws.com/tilde-model/rapid2019.de-en.zip aafe431338abb98fc20951b2d6011223a1b91311 111888392
+http://data.statmt.org/wmt19/translation-task/dev.tgz 451ce2cae815c8392212ccb3f54f5dcddb9b2b9e 38654961
+http://data.statmt.org/wmt19/translation-task/test.tgz ce02a36fb2cd41abfa19d36eb8c8d50241ed3346 3533424
+https://gluonnlp-numpy-data.s3-accelerate.amazonaws.com/wmt/cwmt.tar.gz 88c2f4295169e9f0a9834bf8bff87e3fd4c04055 709032378
+http://data.statmt.org/news-crawl/de/news.2007.de.shuffled.deduped.gz 9d746b9df345f764e6e615119113c70e3fb0858c 90104365
+http://data.statmt.org/news-crawl/de/news.2008.de.shuffled.deduped.gz 185a24e8833844486aee16cb5decf9a64da1c101 308205291
+http://data.statmt.org/news-crawl/de/news.2009.de.shuffled.deduped.gz 9f7645fc6467de88f4205d94f483194838bad8ce 317590378
+http://data.statmt.org/news-crawl/de/news.2010.de.shuffled.deduped.gz f29b761194e9606f086102cfac12813931575818 170405229
+http://data.statmt.org/news-crawl/de/news.2011.de.shuffled.deduped.gz 613b16e7a1cb8559dd428525a4c3b42c8a4dc278 661772046
+http://data.statmt.org/news-crawl/de/news.2012.de.shuffled.deduped.gz 1bc419364ea3fe2f9ba4236947c012d4198d9282 854369573
+http://data.statmt.org/news-crawl/de/news.2013.de.shuffled.deduped.gz 3edd84a7f105907608371c81babc7a9078f40aac 1987692337
+http://data.statmt.org/news-crawl/de/news.2014.de.shuffled.deduped.gz 1466c67b330c08ab5ab7d48e666c1d3a0bb4e479 2018482812
+http://data.statmt.org/news-crawl/de/news.2015.de.shuffled.deduped.gz 2c6d5ec9f8fe51e9eb762be8ff7107c6116c00c4 1772843312
+http://data.statmt.org/news-crawl/de/news.2016.de.shuffled.deduped.gz e7d235c5d28e36dcf6382f1aa12c6ff37d4529bb 1276921550
+http://data.statmt.org/news-crawl/de/news.2017.de.shuffled.deduped.gz f70b4a67bc04c0fdc2ec955b737fa22681e8c038 1863251604
+http://data.statmt.org/news-crawl/de/news.2018.de.shuffled.deduped.gz 43f8237de1e219276c0682255def13aa2cb80e35 2000806230
diff --git a/scripts/index.rst b/scripts/index.rst
deleted file mode 100644
index ca7d84ac00..0000000000
--- a/scripts/index.rst
+++ /dev/null
@@ -1,93 +0,0 @@
-Model Zoo
-=========
-
-.. container:: cards
-
- .. card::
- :title: Word Embedding
- :link: word_embeddings/index.html
-
- Mapping words to vectors.
-
- .. card::
- :title: Language Modeling
- :link: language_model/index.html
-
- Learning the distribution and representation of sequences of words.
-
- .. card::
- :title: Machine Translation
- :link: machine_translation/index.html
-
- From "Hello" to "Bonjour".
-
- .. card::
- :title: Text Classification
- :link: text_classification/index.html
-
- Categorizing documents.
-
- .. card::
- :title: Sentiment Analysis
- :link: sentiment_analysis/index.html
-
- Classifying polarity of emotions and opinions.
-
- .. card::
- :title: Parsing
- :link: parsing/index.html
-
- Dependency parsing.
-
- .. card::
- :title: Natural Language Inference
- :link: natural_language_inference/index.html
-
- Determining if the premise semantically entails the hypothesis.
-
- .. card::
- :title: Text Generation
- :link: text_generation/index.html
-
- Generating text from language models.
-
- .. card::
- :title: BERT
- :link: bert/index.html
-
- Transferring pre-trained language representations to language understanding tasks.
-
- .. card::
- :title: Named Entity Recognition
- :link: ner/index.html
-
- Locating and classifying named entity mentioned in unstructured texts.
-
- .. card::
- :title: Intent Classification and Slot Labeling
- :link: intent_cls_slot_labeling/index.html
-
- Predicting the intent of the query and extracting semantic concepts in the query.
-
- .. card::
- :title: Model Conversion
- :link: model_zoo/conversion_tools/index.html
-
- Converting NLP models from other frameworks to GluonNLP.
-
-.. toctree::
- :hidden:
- :maxdepth: 1
-
- word_embeddings/index
- language_model/index
- machine_translation/index
- text_classification/index
- sentiment_analysis/index
- natural_language_inference/index
- text_generation/index
- parsing/index
- bert/index
- ner/index
- intent_cls_slot_labeling/index
- conversion_tools/index
diff --git a/scripts/intent_cls_slot_labeling/finetune_icsl.py b/scripts/intent_cls_slot_labeling/finetune_icsl.py
deleted file mode 100644
index 75c4aa3a14..0000000000
--- a/scripts/intent_cls_slot_labeling/finetune_icsl.py
+++ /dev/null
@@ -1,461 +0,0 @@
-"""
-Intent Classification and Slot Labelling with BERT
-
-=========================================================================================
-
-This example shows how to implement finetune a model with pre-trained BERT parameters for
-joint intent classification and slot labelling, with Gluon NLP Toolkit.
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation,arguments-differ,unused-variable,missing-docstring,wrong-import-order
-import os
-import sys
-import time
-import argparse
-import random
-import numpy as np
-import pandas as pd
-from tqdm import tqdm
-import mxnet as mx
-from mxnet import gluon
-from mxnet.gluon import nn, Block
-from seqeval.metrics import f1_score as ner_f1_score
-import gluonnlp as nlp
-from gluonnlp.data import BERTTokenizer, ATISDataset, SNIPSDataset
-
-nlp.utils.check_version('0.7.0')
-
-class BERTForICSL(Block):
- """Model
-
- """
- def __init__(self, bert, num_intent_classes, num_slot_classes, dropout_prob,
- prefix=None, params=None):
- """
-
- Parameters
- ----------
- bert : Block
- num_intent_classes : int
- num_slot_classes : int
- dropout_prob : float
- prefix : None or str
- params : None or ParamDict
- """
- super(BERTForICSL, self).__init__(prefix=prefix, params=params)
- self.bert = bert
- with self.name_scope():
- self.intent_classifier = nn.HybridSequential()
- with self.intent_classifier.name_scope():
- self.intent_classifier.add(nn.Dropout(rate=dropout_prob))
- self.intent_classifier.add(nn.Dense(units=num_intent_classes, flatten=False))
- self.slot_tagger = nn.HybridSequential()
- with self.slot_tagger.name_scope():
- self.slot_tagger.add(nn.Dropout(rate=dropout_prob))
- self.slot_tagger.add(nn.Dense(units=num_slot_classes, flatten=False))
-
- def forward(self, inputs, valid_length):
- """
-
- Parameters
- ----------
- inputs : NDArray
- The input sentences, has shape (batch_size, seq_length)
- valid_length : NDArray
- The valid length of the sentences
-
- Returns
- -------
- intent_scores : NDArray
- Shape (batch_size, num_classes)
- slot_scores : NDArray
- Shape (batch_size, seq_length, num_tag_types)
- """
- token_types = mx.nd.zeros_like(inputs)
- encoded_states, pooler_out = self.bert(inputs, token_types, valid_length)
- intent_scores = self.intent_classifier(pooler_out)
- slot_scores = self.slot_tagger(encoded_states)
- return intent_scores, slot_scores
-
-
-class IDSLSubwordTransform():
- """Transform the word_tokens/tags by the subword tokenizer
-
- """
- def __init__(self, subword_vocab, subword_tokenizer, slot_vocab, cased=False):
- """
-
- Parameters
- ----------
- subword_vocab : Vocab
- subword_tokenizer : Tokenizer
- cased : bool
- Whether to convert all characters to lower
- """
- self._subword_vocab = subword_vocab
- self._subword_tokenizer = subword_tokenizer
- self._slot_vocab = slot_vocab
- self._cased = cased
- self._slot_pad_id = self._slot_vocab['O']
-
-
- def __call__(self, word_tokens, tags, intent_ids):
- """ Transform the word_tokens/tags by the subword tokenizer
-
- Parameters
- ----------
- word_tokens : List[str]
- tags : List[str]
- intent_ids : np.ndarray
-
- Returns
- -------
- subword_ids : np.ndarray
- subword_mask : np.ndarray
- selected : np.ndarray
- padded_tag_ids : np.ndarray
- intent_label : int
- length : int
- """
- subword_ids = []
- subword_mask = []
- selected = []
- padded_tag_ids = []
- intent_label = intent_ids[0]
- ptr = 0
- for token, tag in zip(word_tokens, tags):
- if not self._cased:
- token = token.lower()
- token_sw_ids = self._subword_vocab[self._subword_tokenizer(token)]
- subword_ids.extend(token_sw_ids)
- subword_mask.extend([1] + [0] * (len(token_sw_ids) - 1))
- selected.append(ptr)
- padded_tag_ids.extend([self._slot_vocab[tag]] +
- [self._slot_pad_id] * (len(token_sw_ids) - 1))
- ptr += len(token_sw_ids)
- length = len(subword_ids)
- if len(subword_ids) != len(padded_tag_ids):
- print(word_tokens)
- print(tags)
- print(subword_ids)
- print(padded_tag_ids)
- return np.array(subword_ids, dtype=np.int32),\
- np.array(subword_mask, dtype=np.int32),\
- np.array(selected, dtype=np.int32),\
- np.array(padded_tag_ids, dtype=np.int32),\
- intent_label,\
- length
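-
-# For illustration, the alignment produced by IDSLSubwordTransform on a tiny, hypothetical
-# example (the exact subword split depends on the BERT vocabulary):
-#
-#   words        : ['long',      'beach']
-#   tags         : ['B-fromloc', 'I-fromloc']
-#   subwords     : ['long',      'be', '##ach']
-#   subword_mask : [1,           1,    0]           # 1 marks the first subword of each word
-#   selected     : [0, 1]                           # positions of the first subwords
-#   padded tags  : ['B-fromloc', 'I-fromloc', 'O']  # non-first subwords get the 'O' pad tag
-#
-# At evaluation time only the positions in `selected` are scored, so the slot F1 is computed on
-# word-level tags even though the model operates on subwords.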
-
-
-def parse_args():
- arg_parser = argparse.ArgumentParser(
- description='Train a BERT-based model for joint intent detection and slot filling on '
- 'ATIS/SNIPS dataset.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- arg_parser.add_argument('--seed', type=int, default=123)
- arg_parser.add_argument('--dataset', choices=['atis', 'snips'], default='atis')
- arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12',
- help='Name of the BERT model')
- arg_parser.add_argument('--cased', action='store_true',
- help='Whether to use the cased model trained on book_corpus_wiki_en.'
-                                 ' Otherwise, use the uncased model.')
- arg_parser.add_argument('--dropout-prob', type=float, default=0.1,
- help='Dropout probability for the last layer')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
-    arg_parser.add_argument('--epochs', type=int, default=40, help='Number of training epochs')
- arg_parser.add_argument('--optimizer', type=str, default='bertadam',
- help='Optimization algorithm to use')
- arg_parser.add_argument('--learning-rate', type=float, default=5e-5,
- help='Learning rate for optimization')
- arg_parser.add_argument('--wd', type=float, default=0.0,
- help='Weight decay')
- arg_parser.add_argument('--warmup-ratio', type=float, default=0.1,
- help='Warmup ratio for learning rate scheduling')
- arg_parser.add_argument('--slot-loss-mult', type=float, default=1.0,
- help='Multiplier for the slot loss.')
- arg_parser.add_argument('--save-dir', type=str, default='saved_model')
- arg_parser.add_argument('--gpu', type=int, default=None,
- help='Number (index) of GPU to run on, e.g. 0.'
- ' If not specified, uses CPU.')
- args = arg_parser.parse_args()
- return args
-
-
-
-def print_sample(dataset, sample_id):
- """ Print sample in the dataset
-
- Parameters
- ----------
- dataset : SimpleDataset
- sample_id: int
-
- Returns
- -------
- """
- word_tokens, tags, intent_ids = dataset[sample_id]
- print('Sample #ID: {} Intent: {}'.format(sample_id,
- [dataset.intent_vocab.idx_to_token[ele]
- for ele in intent_ids]))
- df = pd.DataFrame(list(zip(word_tokens, tags)))
- df.index.name = None
- print('Sequence:')
- print(df.to_string(header=False))
-
-
-def evaluation(ctx, data_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab):
- """ Evaluate the trained model
-
- Parameters
- ----------
- ctx : Context
- data_loader : DataLoader
- net : Block
- intent_pred_loss : Loss
- slot_pred_loss : Loss
- slot_vocab : Vocab
-
- Returns
- -------
- avg_intent_loss : float
- avg_slot_loss : float
- intent_acc : float
- slot_f1 : float
- pred_slots : list
- gt_slots : list
- """
- nsample = 0
- nslot = 0
- avg_intent_loss = 0
- avg_slot_loss = 0
- correct_intent = 0
- pred_slots = []
- gt_slots = []
- for token_ids, mask, selected, slot_ids, intent_label, valid_length in data_loader:
- token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
- mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
- slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
- intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
- valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
- batch_nslot = mask.sum().asscalar()
- batch_nsample = token_ids.shape[0]
- # Forward network
- intent_scores, slot_scores = net(token_ids, valid_length)
- intent_loss = intent_pred_loss(intent_scores, intent_label)
- slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
- avg_intent_loss += intent_loss.sum().asscalar()
- avg_slot_loss += slot_loss.sum().asscalar()
- pred_slot_ids = mx.nd.argmax(slot_scores, axis=-1).astype(np.int32)
- correct_intent += (mx.nd.argmax(intent_scores, axis=-1).astype(np.int32)
- == intent_label).sum().asscalar()
- for i in range(batch_nsample):
- ele_valid_length = int(valid_length[i].asscalar())
- ele_sel = selected[i].asnumpy()[:ele_valid_length]
- ele_gt_slot_ids = slot_ids[i].asnumpy()[ele_sel]
- ele_pred_slot_ids = pred_slot_ids[i].asnumpy()[ele_sel]
- ele_gt_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_gt_slot_ids]
- ele_pred_slot_tokens = [slot_vocab.idx_to_token[v] for v in ele_pred_slot_ids]
- gt_slots.append(ele_gt_slot_tokens)
- pred_slots.append(ele_pred_slot_tokens)
- nsample += batch_nsample
- nslot += batch_nslot
- avg_intent_loss /= nsample
- avg_slot_loss /= nslot
- intent_acc = correct_intent / float(nsample)
- slot_f1 = ner_f1_score(pred_slots, gt_slots)
- return avg_intent_loss, avg_slot_loss, intent_acc, slot_f1, pred_slots, gt_slots
-
-
-
-def train(args):
- ctx = mx.cpu() if args.gpu is None else mx.gpu(args.gpu)
- dataset_name = 'book_corpus_wiki_en_cased' if args.cased else 'book_corpus_wiki_en_uncased'
- bert_model, bert_vocab = nlp.model.get_model(name=args.bert_model,
- dataset_name=dataset_name,
- pretrained=True,
- ctx=ctx,
- use_pooler=True,
- use_decoder=False,
- use_classifier=False,
- dropout=args.dropout_prob,
- embed_dropout=args.dropout_prob)
- tokenizer = BERTTokenizer(bert_vocab, lower=not args.cased)
- if args.dataset == 'atis':
- train_data = ATISDataset('train')
- dev_data = ATISDataset('dev')
- test_data = ATISDataset('test')
- intent_vocab = train_data.intent_vocab
- slot_vocab = train_data.slot_vocab
- elif args.dataset == 'snips':
- train_data = SNIPSDataset('train')
- dev_data = SNIPSDataset('dev')
- test_data = SNIPSDataset('test')
- intent_vocab = train_data.intent_vocab
- slot_vocab = train_data.slot_vocab
- else:
- raise NotImplementedError
- print('Dataset {}'.format(args.dataset))
- print(' #Train/Dev/Test = {}/{}/{}'.format(len(train_data), len(dev_data), len(test_data)))
- print(' #Intent = {}'.format(len(intent_vocab)))
- print(' #Slot = {}'.format(len(slot_vocab)))
-    # Display an example
-    print('Display a sample:')
- print_sample(test_data, 1)
- print('-' * 80)
-
- idsl_transform = IDSLSubwordTransform(subword_vocab=bert_vocab,
- subword_tokenizer=tokenizer,
- slot_vocab=slot_vocab,
- cased=args.cased)
- train_data_bert = train_data.transform(idsl_transform, lazy=False)
- dev_data_bert = dev_data.transform(idsl_transform, lazy=False)
- test_data_bert = test_data.transform(idsl_transform, lazy=False)
- # Construct the DataLoader
- batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(pad_val=0), # Subword ID
- nlp.data.batchify.Pad(pad_val=0), # Subword Mask
- nlp.data.batchify.Pad(pad_val=0), # Beginning of subword
- nlp.data.batchify.Pad(pad_val=0), # Tag IDs
- nlp.data.batchify.Stack(), # Intent Label
- nlp.data.batchify.Stack()) # Valid Length
- train_batch_sampler = nlp.data.sampler.SortedBucketSampler(
- [len(ele) for ele in train_data_bert],
- batch_size=args.batch_size,
- mult=20,
- shuffle=True)
- train_loader = gluon.data.DataLoader(dataset=train_data_bert,
- num_workers=4,
- batch_sampler=train_batch_sampler,
- batchify_fn=batchify_fn)
- dev_loader = gluon.data.DataLoader(dataset=dev_data_bert,
- num_workers=4,
- batch_size=args.batch_size,
- batchify_fn=batchify_fn,
- shuffle=False)
- test_loader = gluon.data.DataLoader(dataset=test_data_bert,
- num_workers=4,
- batch_size=args.batch_size,
- batchify_fn=batchify_fn,
- shuffle=False)
-
- # Build the network and loss functions
- intent_pred_loss = gluon.loss.SoftmaxCELoss()
- slot_pred_loss = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])
-
- net = BERTForICSL(bert_model, num_intent_classes=len(intent_vocab),
- num_slot_classes=len(slot_vocab), dropout_prob=args.dropout_prob)
- net.slot_tagger.initialize(ctx=ctx, init=mx.init.Normal(0.02))
- net.intent_classifier.initialize(ctx=ctx, init=mx.init.Normal(0.02))
- net.hybridize()
- intent_pred_loss.hybridize()
- slot_pred_loss.hybridize()
-
- # Build the trainer
- trainer = gluon.Trainer(net.collect_params(), args.optimizer,
- {'learning_rate': args.learning_rate, 'wd': args.wd},
- update_on_kvstore=False)
-
- step_num = 0
- num_train_steps = int(len(train_batch_sampler) * args.epochs)
- num_warmup_steps = int(num_train_steps * args.warmup_ratio)
- best_dev_sf1 = -1
- for epoch_id in range(args.epochs):
- avg_train_intent_loss = 0.0
- avg_train_slot_loss = 0.0
- nsample = 0
- nslot = 0
- ntoken = 0
- train_epoch_start = time.time()
- for token_ids, mask, _, slot_ids, intent_label, valid_length\
- in tqdm(train_loader, file=sys.stdout):
- ntoken += valid_length.sum().asscalar()
- token_ids = mx.nd.array(token_ids, ctx=ctx).astype(np.int32)
- mask = mx.nd.array(mask, ctx=ctx).astype(np.float32)
- slot_ids = mx.nd.array(slot_ids, ctx=ctx).astype(np.int32)
- intent_label = mx.nd.array(intent_label, ctx=ctx).astype(np.int32)
- valid_length = mx.nd.array(valid_length, ctx=ctx).astype(np.float32)
- batch_nslots = mask.sum().asscalar()
- batch_nsample = token_ids.shape[0]
-
- # Set learning rate warm-up
- step_num += 1
- if step_num < num_warmup_steps:
- new_lr = args.learning_rate * step_num / num_warmup_steps
- else:
- offset = ((step_num - num_warmup_steps) * args.learning_rate /
- (num_train_steps - num_warmup_steps))
- new_lr = args.learning_rate - offset
- trainer.set_learning_rate(new_lr)
-
- with mx.autograd.record():
- intent_scores, slot_scores = net(token_ids, valid_length)
- intent_loss = intent_pred_loss(intent_scores, intent_label)
- slot_loss = slot_pred_loss(slot_scores, slot_ids, mask.expand_dims(axis=-1))
- intent_loss = intent_loss.mean()
- slot_loss = slot_loss.sum() / batch_nslots
- loss = intent_loss + args.slot_loss_mult * slot_loss
- loss.backward()
- trainer.update(1.0)
- avg_train_intent_loss += intent_loss.asscalar() * batch_nsample
- avg_train_slot_loss += slot_loss.asscalar() * batch_nslots
- nsample += batch_nsample
- nslot += batch_nslots
- train_epoch_end = time.time()
- avg_train_intent_loss /= nsample
- avg_train_slot_loss /= nslot
- print('[Epoch {}] train intent/slot = {:.3f}/{:.3f}, #token per second={:.0f}'.format(
- epoch_id, avg_train_intent_loss, avg_train_slot_loss,
- ntoken / (train_epoch_end - train_epoch_start)))
- avg_dev_intent_loss, avg_dev_slot_loss, dev_intent_acc,\
- dev_slot_f1, dev_pred_slots, dev_gt_slots\
- = evaluation(ctx, dev_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('[Epoch {}] dev intent/slot = {:.3f}/{:.3f},'
- ' slot f1 = {:.2f}, intent acc = {:.2f}'.format(epoch_id, avg_dev_intent_loss,
- avg_dev_slot_loss,
- dev_slot_f1 * 100,
- dev_intent_acc * 100))
- if dev_slot_f1 > best_dev_sf1:
- best_dev_sf1 = dev_slot_f1
- avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
- test_slot_f1, test_pred_slots, test_gt_slots \
- = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('[Epoch {}] test intent/slot = {:.3f}/{:.3f},'
- ' slot f1 = {:.2f}, intent acc = {:.2f}'.format(epoch_id, avg_test_intent_loss,
- avg_test_slot_loss,
- test_slot_f1 * 100,
- test_intent_acc * 100))
- if not os.path.exists(args.save_dir):
- os.makedirs(args.save_dir)
- net.save_parameters(os.path.join(args.save_dir, 'best_valid.params'))
- print('Evaluate the best model:')
- net.load_parameters(os.path.join(args.save_dir, 'best_valid.params'))
- avg_test_intent_loss, avg_test_slot_loss, test_intent_acc, \
- test_slot_f1, test_pred_slots, test_gt_slots \
- = evaluation(ctx, test_loader, net, intent_pred_loss, slot_pred_loss, slot_vocab)
- print('Best validation model --> Slot F1={:.2f}, Intent acc={:.2f}'
- .format(test_slot_f1 * 100, test_intent_acc * 100))
- with open(os.path.join(args.save_dir, 'test_error.txt'), 'w') as of:
- of.write('{} {}\n'.format(test_slot_f1, test_intent_acc))
-
-if __name__ == '__main__':
- args = parse_args()
- np.random.seed(args.seed)
- mx.random.seed(args.seed)
- random.seed(args.seed)
- train(args)
diff --git a/scripts/intent_cls_slot_labeling/index.rst b/scripts/intent_cls_slot_labeling/index.rst
deleted file mode 100644
index 7b46da1bff..0000000000
--- a/scripts/intent_cls_slot_labeling/index.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-Intent Classification and Slot Labeling
----------------------------------------
-
-:download:`Download scripts `
-
-Reference:
-- Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-- Chen, Qian, et al. "`BERT for Joint Intent Classification and Slot Filling. `_" arXiv preprint arXiv:1902.10909 (2019).
-
-Joint Intent Classification and Slot Labelling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Intent classification and slot labelling are two essential problems in Natural Language Understanding (NLU).
-In *intent classification*, the agent needs to detect the intention that the speaker's utterance conveys. For example, when the speaker says "Book a flight from Long Beach to Seattle", the intention is to book a flight ticket.
-In *slot labelling*, the agent needs to extract the semantic entities that are related to the intent. In our previous example,
-"Long Beach" and "Seattle" are two semantic constituents related to the flight, i.e., the origin and the destination.
-
-Essentially, *intent classification* can be viewed as a sequence classification problem and *slot labelling* can be viewed as a
-sequence tagging problem similar to Named-entity Recognition (NER). Due to their inner correlation, these two tasks are usually
-trained jointly with a multi-task objective function.
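-
-As a rough sketch of this multi-task objective (mirroring the ``--slot-loss-mult`` option of the
-training script; the ``joint_loss`` helper below is purely illustrative), the sentence-level intent
-loss and the token-level slot loss are combined into a weighted sum:
-
-.. code-block:: python
-
-    from mxnet import gluon
-
-    intent_loss_fn = gluon.loss.SoftmaxCELoss()                 # over intent classes, per sentence
-    slot_loss_fn = gluon.loss.SoftmaxCELoss(batch_axis=[0, 1])  # over slot tags, per token
-
-    def joint_loss(intent_scores, intent_label, slot_scores, slot_ids, mask, slot_loss_mult=1.0):
-        """Weighted sum of intent and slot cross-entropy; `mask` zeroes out padding subwords."""
-        l_intent = intent_loss_fn(intent_scores, intent_label).mean()
-        l_slot = slot_loss_fn(slot_scores, slot_ids, mask.expand_dims(axis=-1)).sum() / mask.sum()
-        return l_intent + slot_loss_mult * l_slot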
-
-Here's one example from the ATIS dataset; it uses the `IOB2 format `__.
-
-+-----------+--------------------------+--------------+
-| Sentence | Tags | Intent Label |
-+===========+==========================+==============+
-| are | O | atis_flight |
-+-----------+--------------------------+--------------+
-| there | O | |
-+-----------+--------------------------+--------------+
-| any | O | |
-+-----------+--------------------------+--------------+
-| flight | O | |
-+-----------+--------------------------+--------------+
-| from | O | |
-+-----------+--------------------------+--------------+
-| long | B-fromloc.city_name | |
-+-----------+--------------------------+--------------+
-| beach | I-fromloc.city_name | |
-+-----------+--------------------------+--------------+
-| to | O | |
-+-----------+--------------------------+--------------+
-| columbus | B-toloc.city_name | |
-+-----------+--------------------------+--------------+
-| on | O | |
-+-----------+--------------------------+--------------+
-| wednesday | B-depart_date.day_name | |
-+-----------+--------------------------+--------------+
-| april | B-depart_date.month_name | |
-+-----------+--------------------------+--------------+
-| sixteen | B-depart_date.day_number | |
-+-----------+--------------------------+--------------+
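-
-For reference, the slot F1 reported below is a span-level metric (computed with ``seqeval``):
-contiguous ``B-``/``I-`` tags are grouped into labelled spans, and a predicted span counts as
-correct only if both its label and its boundaries match the reference. A minimal sketch of this
-grouping (illustrative only, not the ``seqeval`` implementation):
-
-.. code-block:: python
-
-    def iob2_to_spans(tags):
-        """Group IOB2 tags into (label, start, end) spans, with `end` exclusive."""
-        spans, start, label = [], None, None
-        for i, tag in enumerate(tags + ['O']):  # the sentinel 'O' flushes the last open span
-            if tag.startswith('B-') or tag == 'O' or (tag.startswith('I-') and tag[2:] != label):
-                if label is not None:
-                    spans.append((label, start, i))
-                start, label = (i, tag[2:]) if tag != 'O' else (None, None)
-            # a well-formed 'I-<label>' simply extends the current span
-        return spans
-
-    tags = ['O', 'B-fromloc.city_name', 'I-fromloc.city_name', 'O', 'B-toloc.city_name']
-    print(iob2_to_spans(tags))
-    # [('fromloc.city_name', 1, 3), ('toloc.city_name', 4, 5)]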
-
-
-
-In this example, we demonstrate how to use GluonNLP to fine-tune a pretrained BERT model for joint intent classification and slot labelling.
-We use two datasets, `ATIS `__ and `SNIPS `__.
-
-The training script requires the seqeval and tqdm packages:
-
-.. code-block:: console
-
- $ pip3 install seqeval --user
- $ pip3 install tqdm --user
-
-For the ATIS dataset, use the following command to run the experiment:
-
-.. code-block:: console
-
- $ python finetune_icsl.py --gpu 0 --dataset atis
-
-It produces the final slot labelling F1 = `95.83%` and intent classification accuracy = `98.66%`
-
-For the SNIPS dataset, use the following command to run the experiment:
-
-.. code-block:: console
-
- $ python finetune_icsl.py --gpu 0 --dataset snips
-
-It produces the final slot labelling F1 = `96.06%` and intent classification accuracy = `98.71%`
-
-Also, we train the models with three random seeds and report the mean/std.
-
-For ATIS
-
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| Models | Intent Acc (%) | Slot F1 (%) |
-+============================================================================================+================+=============+
-| `Intent Gating & self-attention, EMNLP 2018 `__ | 98.77 | 96.52 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| `BLSTM-CRF + ELMo, AAAI 2019, `__ | 97.42 | 95.62 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| `Joint BERT, Arxiv 2019, `__ | 97.5 | 96.1 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-| Ours | 98.66±0.00 | 95.88±0.04 |
-+--------------------------------------------------------------------------------------------+----------------+-------------+
-
-For SNIPS
-
-+--------------------------------------------------------------------+----------------+-------------+
-| Models | Intent Acc (%) | Slot F1 (%) |
-+====================================================================+================+=============+
-| `BLSTM-CRF + ELMo, AAAI 2019 `__ | 99.29 | 93.90 |
-+--------------------------------------------------------------------+----------------+-------------+
-| `Joint BERT, Arxiv 2019 `__ | 98.60 | 97.00 |
-+--------------------------------------------------------------------+----------------+-------------+
-| Ours | 98.81±0.13 | 95.94±0.10 |
-+--------------------------------------------------------------------+----------------+-------------+
diff --git a/scripts/language_model/__init__.py b/scripts/language_model/__init__.py
deleted file mode 100644
index a747f8c58b..0000000000
--- a/scripts/language_model/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Language Model example."""
diff --git a/scripts/language_model/cache_language_model.py b/scripts/language_model/cache_language_model.py
deleted file mode 100644
index a8acaa6771..0000000000
--- a/scripts/language_model/cache_language_model.py
+++ /dev/null
@@ -1,211 +0,0 @@
-"""
-Neural Cache Language Model
-===========================
-This example shows how to build a neural cache language model based on
-a pre-trained word-level language model on WikiText-2 with Gluon NLP Toolkit.
-
-We implement the neural cache language model proposed in the following work.
-@article{grave2016improving,
- title={Improving neural language models with a continuous cache},
- author={Grave, Edouard and Joulin, Armand and Usunier, Nicolas},
- journal={ICLR},
- year={2017}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import math
-import os
-import sys
-import mxnet as mx
-import gluonnlp as nlp
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description=
- 'MXNet Neural Cache Language Model on Wikitext-2.')
-parser.add_argument('--bptt', type=int, default=2000,
- help='sequence length')
-parser.add_argument('--model_name', type=str, default='awd_lstm_lm_1150',
- help='name of the pre-trained language model')
-parser.add_argument('--gpus', type=str,
-                    help='list of gpus to run, e.g. 0 or 0,2,5. Empty means using cpu '
-                         '(using a single gpu is suggested).')
-parser.add_argument('--window', type=int, default=2000,
- help='cache window length')
-parser.add_argument('--theta', type=float, default=0.662,
-                    help='the scalar that controls the flatness of the cache distribution '
-                         'used to predict the next word')
-parser.add_argument('--lambdas', type=float, default=0.1279,
-                    help='linear interpolation weight between the cache '
-                         'distribution and the vocabulary distribution')
-parser.add_argument('--path_to_params_file', type=str, default=None,
- help='path to the saved params file of user pre-trained model, '
- 'including the params file, e.g., ~/.mxnet/models/awd_lstm_lm_1150.params')
-args = parser.parse_args()
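-
-# For reference (following Grave et al., 2017, cited above), the cache model interpolates the
-# base LM distribution with a distribution over the last `window` hidden states. Schematically,
-# with h_t the current hidden state and (h_i, x_{i+1}) the cached state/next-word pairs:
-#
-#     p(w | history) = (1 - lambdas) * p_vocab(w | history)
-#                      + lambdas * sum_i 1[x_{i+1} = w] * softmax_i(theta * h_t . h_i)
-#
-# so `--theta` controls how peaked the cache distribution is and `--lambdas` is the linear
-# interpolation weight; both are simply forwarded to `nlp.model.train.get_cache_model` /
-# `CacheCell` below.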
-
-###############################################################################
-# Load vocabulary
-###############################################################################
-
-context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-
-print(args)
-
-_, vocab = nlp.model.get_model(name=args.model_name,
- dataset_name='wikitext-2',
- pretrained=False,
- ctx=context)
-ntokens = len(vocab)
-
-###############################################################################
-# Build the cache model and load pre-trained language model
-###############################################################################
-
-
-if not args.path_to_params_file:
- cache_cell = nlp.model.train.get_cache_model(name=args.model_name,
- dataset_name='wikitext-2',
- window=args.window,
- theta=args.theta,
- lambdas=args.lambdas,
- ctx=context)
-else:
- model, _ = nlp.model.get_model(name=args.model_name,
- dataset_name='wikitext-2',
- pretrained=False,
- ctx=context)
- cache_cell = nlp.model.train.CacheCell(model, ntokens, args.window, args.theta, args.lambdas)
- cache_cell.load_parameters(args.path_to_params_file, ctx=context)
-
-###############################################################################
-# Load data
-###############################################################################
-
-val_dataset, test_dataset = \
- [nlp.data.WikiText2(segment=segment,
-                        skip_empty=False, bos=None, eos='<eos>')
- for segment in ['val', 'test']]
-
-val_batch_size = 1
-val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size)
-val_data = val_batchify(val_dataset)
-test_batch_size = 1
-test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size)
-test_data = test_batchify(test_dataset)
-
-###############################################################################
-# Training
-###############################################################################
-
-
-def detach(hidden):
- """Transfer hidden states into new states, to detach them from the history.
- Parameters
- ----------
- hidden : NDArray
- The hidden states
- Returns
-    -------
-    hidden : NDArray
- The detached hidden states
- """
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-def get_batch(data_source, i, seq_len=None):
- """Get mini-batches of the dataset.
-
- Parameters
- ----------
- data_source : NDArray
-        The dataset to draw mini-batches from.
- i : int
- The index of the batch, starting from 0.
- seq_len : int
- The length of each sample in the batch.
-
- Returns
- -------
- data: NDArray
- The context
- target: NDArray
- The words to predict
- """
- seq_len = min(seq_len if seq_len else args.bptt, len(data_source) - 1 - i)
- data = data_source[i:i+seq_len]
- target = data_source[i+1:i+1+seq_len]
- return data, target
-
-
-def evaluate(data_source, batch_size, ctx=None):
- """Evaluate the model on the dataset with cache model.
-
- Parameters
- ----------
- data_source : NDArray
-        The dataset that the model is evaluated on.
- batch_size : int
- The size of the mini-batch.
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
- total_L = 0
- hidden = cache_cell.\
- begin_state(func=mx.nd.zeros, batch_size=batch_size, ctx=context[0])
- next_word_history = None
- cache_history = None
- for i in range(0, len(data_source) - 1, args.bptt):
- if i > 0:
- print('Batch %d/%d, ppl %f'%
- (i, len(data_source), math.exp(total_L/i)))
- data, target = get_batch(data_source, i)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- L = 0
- outs, next_word_history, cache_history, hidden = \
- cache_cell(data, target, next_word_history, cache_history, hidden)
- for out in outs:
- L += (-mx.nd.log(out)).asscalar()
- total_L += L / data.shape[1]
- hidden = detach(hidden)
- return total_L / len(data_source)
-
-
-if __name__ == '__main__':
- start_pipeline_time = time.time()
- final_val_L = evaluate(val_data, val_batch_size, context[0])
- final_test_L = evaluate(test_data, test_batch_size, context[0])
- print('Best validation loss %.2f, val ppl %.2f' % (final_val_L, math.exp(final_val_L)))
- print('Best test loss %.2f, test ppl %.2f' % (final_test_L, math.exp(final_test_L)))
- print('Total time cost %.2fs' % (time.time()-start_pipeline_time))
diff --git a/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py b/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py
deleted file mode 100644
index 065f1bcb4a..0000000000
--- a/scripts/language_model/conversion_utils/compare_transformerxl_pytorch_gluon_model.py
+++ /dev/null
@@ -1,181 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-import argparse
-import glob
-import logging
-import os
-import pickle
-import re
-import sys
-
-import mxnet as mx
-import numpy as np
-import tensorflow as tf
-import torch
-from absl import flags
-
-import gluonnlp as nlp
-import transformers
-from utils import read_tf_checkpoint, to_gluon_kwargs
-
-
-def get_kwargs_and_corpus(args):
- # Infer model config
- with open(os.path.join(args.tf_data_dir, 'cache.pkl'), 'rb') as f:
- corpus = pickle.load(f, encoding='latin1')
- tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
- kwargs, _ = to_gluon_kwargs(tf_tensors)
- return kwargs, corpus
-
-
-def get_data(args):
- record_info_dir = os.path.join(args.tf_data_dir, 'tfrecords')
- assert os.path.exists(record_info_dir)
- record_info_file = glob.glob(os.path.join(record_info_dir, "record_info*json"))[0]
- eval_split, batch_size, tgt_len = re.search(r'record_info-(\w+)\.bsz-(\d+)\.tlen-(\d+).json',
- record_info_file).groups()
- batch_size, tgt_len = int(batch_size), int(tgt_len)
-
- num_core_per_host = 1
- num_hosts = 1
- eval_input_fn, eval_record_info = data_utils.get_input_fn(
- record_info_dir=record_info_dir, split=eval_split, per_host_bsz=batch_size, tgt_len=tgt_len,
- num_core_per_host=num_core_per_host, num_hosts=num_hosts, use_tpu=False)
-
- ##### Create computational graph
- eval_set = eval_input_fn({"batch_size": batch_size, "data_dir": record_info_dir})
- input_feed, label_feed = eval_set.make_one_shot_iterator().get_next()
-
- # Extract first two batches
- sess = tf.Session()
- np_features, np_labels = [], []
- for i in range(2):
- feature_i, label_i = sess.run((input_feed, label_feed))
- np_features.append(feature_i[:1]) # force batch_size of 1
- np_labels.append(label_i[:1])
-
- return np_features, np_labels, 1, tgt_len
-
-
-def compare_transformerxl(args, kwargs, corpus):
- # Data
- np_features, np_labels, batch_size, tgt_len = get_data(args)
-
- # Models
- model_p = transformers.TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
- model_p.crit.keep_order = True
- model_p.transformer.output_attentions = False # no change of default; breaks model if changed
- model_p.transformer.output_hidden_states = True
-
- with open(args.gluon_vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
- ctx = mx.gpu()
- model = TransformerXL(vocab_size=len(vocab), clamp_len=model_p.transformer.clamp_len, **kwargs)
- model.initialize(ctx=ctx)
- model.load_parameters(args.gluon_parameter_file, ignore_extra=False)
- model.hybridize()
-
- # Computation
- assert len(np_features) == 2
- mems = model.begin_mems(batch_size, model_p.config.mem_len, context=ctx)
- mems_p = None
- for batch in range(2):
- print('Batch {}'.format(batch))
-
- features_nd = mx.nd.array(np_features[batch], ctx=ctx)
- labels_nd = mx.nd.array(np_labels[batch], ctx=ctx)
- features_p = torch.tensor(np_features[batch], dtype=torch.long)
- labels_p = torch.tensor(np_labels[batch], dtype=torch.long)
-
- loss, mems, last_hidden = model(features_nd, labels_nd, mems)
-
- loss_p, _, mems_p, all_hidden_p = model_p(features_p, mems=mems_p, labels=labels_p)
-
- for i in range(kwargs['num_layers']):
- a_b = mems_p[i][:, 0].numpy() - mems[i][0].asnumpy()
- max_error = a_b.max()
- argmax_error = a_b.argmax()
- stdev = np.std(a_b)
- print('Layer {i}: Maximum error {err:.2e} at position {pos}. stdev={stdev:.2e}'.format(
- i=i, err=max_error, pos=np.unravel_index(argmax_error, shape=a_b.shape),
- stdev=stdev))
- a_b = loss_p.detach().numpy()[0] - loss.asnumpy()[0]
- max_error = a_b.max()
- argmax_error = a_b.argmax()
- stdev = np.std(a_b)
- print('Loss: Maximum error {err:.2e} at position {pos}. stdev={stdev:.2e}'.format(
- i=i, err=max_error, pos=np.unravel_index(argmax_error, shape=a_b.shape), stdev=stdev))
- assert max_error < 5e-5
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
-        description='Comparison script for Tensorflow and Gluon Transformer-XL model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--transformer-xl-repo', type=str, required=True,
- help='Path to https://github.com/kimiyoung/transformer-xl repo.')
- parser.add_argument('--tf-checkpoint-dir', type=str, required=True,
- help='Path to Tensorflow checkpoint folder.')
- parser.add_argument(
- '--tf-model-prefix', type=str, required=True, help='Prefix of the checkpoint files. '
- 'For example model.ckpt-0 or model.ckpt-1191000')
- parser.add_argument(
- '--tf-data-dir', type=str, required=True, help='Path to TransformerXL data folder. '
- 'The folder should contain the tfrecords directory as well as the cache.pkl file. '
- 'tfrecords can be created with the TransformerXL data_utils.py script.')
- parser.add_argument('--gluon-parameter-file', type=str, required=True,
- help='gluon parameter file name.')
- parser.add_argument('--gluon-vocab-file', type=str, required=True,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- # Load stuff required for unpickling
- sys.path.append(os.path.join((args.transformer_xl_repo), 'tf'))
- import vocabulary # pylint: disable=unused-import
- import data_utils
-
- # Infer correct tf flags
- kwargs, corpus = get_kwargs_and_corpus(args)
- tf_argv = [
- 'train.py',
- '--n_layer=' + str(kwargs['num_layers']),
- '--d_model=' + str(kwargs['units']),
- '--d_embed=' + str(kwargs['embed_size']),
- '--n_head=' + str(kwargs['num_heads']),
- '--d_head=' + str(kwargs['units'] // kwargs['num_heads']),
- '--d_inner=' + str(kwargs['hidden_size']),
- '--dropout=0.0',
- '--dropatt=0.0',
- '--same_length=True',
- '--model_dir=' + args.tf_checkpoint_dir,
- '--proj_share_all_but_first=True',
- '--untie_r=True',
- '--div_val=' + str(kwargs['embed_div_val']),
- ]
- tf_flags = flags.FLAGS(tf_argv, known_only=True)
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import TransformerXL
-
- compare_transformerxl(args, kwargs, corpus)
diff --git a/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py b/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py
deleted file mode 100644
index f21374e896..0000000000
--- a/scripts/language_model/conversion_utils/compare_xlnet_pytorch_gluon_model.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# 'License'); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# 'AS IS' BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for model comparison between TF and Gluon."""
-
-import argparse
-import logging
-import os
-import sys
-
-import mxnet as mx
-import numpy as np
-import torch
-
-import gluonnlp as nlp
-import transformers
-
-
-def compare_xlnet(args):
- batch_size, qlen, mlen = 2, 16, 100
-
- model_p = transformers.XLNetLMHeadModel.from_pretrained(
- 'xlnet-base-cased'
- if args.model_name == 'xlnet_cased_L-12_H-768_A-12' else 'xlnet-large-cased', dropout=0)
- model_p.transformer.attentions = False # no change of default
- model_p.transformer.output_hidden_states = True
- model_p.transformer.mem_len = mlen
-
- if args.model_name == 'xlnet_cased_L-12_H-768_A-12':
- kwargs = {
- 'hidden_size': 3072,
- 'units': 768,
- 'activation': 'approx_gelu',
- 'num_heads': 12,
- 'num_layers': 12,
- 'vocab_size': 32000
- }
- elif args.model_name == 'xlnet_cased_L-24_H-1024_A-16':
- kwargs = {
- 'hidden_size': 4096,
- 'units': 1024,
- 'activation': 'approx_gelu',
- 'num_heads': 16,
- 'num_layers': 24,
- 'vocab_size': 32000
- }
-
- with open(args.gluon_vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
- ctx = mx.cpu()
- assert kwargs['vocab_size'] == len(vocab)
- clamp_len = model_p.transformer.clamp_len if model_p.transformer.clamp_len > 0 else None
- model = XLNet(clamp_len=clamp_len, **kwargs)
- model.initialize(ctx=ctx)
- model.load_parameters(args.gluon_parameter_file, ignore_extra=False)
- model.hybridize()
-
- # Computation
- mems = model.begin_mems(batch_size, mlen, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, qlen))
- token_types = mx.nd.ones(shape=(batch_size, qlen))
- output, new_mems = model(x, token_types, mems)
-
- x_p = torch.tensor(x.asnumpy(), dtype=torch.long)
- mems_p = [torch.tensor(mems_i.transpose((1, 0, 2)).asnumpy()) for mems_i in mems]
- token_types_p = torch.tensor(token_types.asnumpy(), dtype=torch.long)
- output_p, new_mems_p, hids_p = model_p(x_p, token_type_ids=token_types_p, mems=mems_p)
-
- for i in range(kwargs['num_layers']):
- a, b = new_mems[i][:, -qlen:].asnumpy(), hids_p[i].detach().numpy()
- assert np.all(np.isclose(a, b, atol=1e-5))
- assert np.all(np.isclose(output.asnumpy(), output_p.detach().numpy(), atol=5e-5))
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
-        description='Comparison script for PyTorch and Gluon XLNet model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--model-name', type=str, required=True,
- choices=['xlnet_cased_L-12_H-768_A-12',
- 'xlnet_cased_L-24_H-1024_A-16'], help='Model name')
- parser.add_argument('--gluon-parameter-file', type=str, required=True,
- help='gluon parameter file name.')
- parser.add_argument('--gluon-vocab-file', type=str, required=True,
- help='gluon vocab file corresponding to --gluon_parameter_file.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import XLNet
-
- compare_xlnet(args)
diff --git a/scripts/language_model/conversion_utils/convert_transformer_xl.py b/scripts/language_model/conversion_utils/convert_transformer_xl.py
deleted file mode 100644
index e5bea1545a..0000000000
--- a/scripts/language_model/conversion_utils/convert_transformer_xl.py
+++ /dev/null
@@ -1,272 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import logging
-import os
-import pickle
-import re
-import sys
-
-import mxnet as mx
-import numpy as np
-
-import gluonnlp as nlp
-from utils import _split_dict, get_hash, to_gluon_kwargs, read_tf_checkpoint
-
-
-def to_gluon_vocab(corpus):
- """Convert a TransformerXL corpus object to a GluonNLP Vocab."""
- # Clean up latin-1 mis-encoding of words
- idx2sym = [w.encode('latin-1').decode('utf-8') for w in corpus.vocab.idx2sym]
- sym2idx = {sym: idx for idx, sym in enumerate(idx2sym)}
-
- special_tokens = dict(unknown_token=None, padding_token=None, bos_token=None)
- if hasattr(corpus.vocab, 'unk_idx'):
- special_tokens['unknown_token'] = idx2sym[corpus.vocab.unk_idx]
-    elif '<unk>' in sym2idx:
-        special_tokens['unknown_token'] = '<unk>'
-    elif '<UNK>' in sym2idx:
-        special_tokens['unknown_token'] = '<UNK>'
-
-    # Discover special tokens
-    if ['<eos>'] == corpus.vocab.special:
-        if '<eos>' in sym2idx:  # Only include if special token is actually used
-            special_tokens['eos_token'] = '<eos>'
-    elif '<S>' in sym2idx:
-        # Special case for model trained on Google 1 Billion Word LM dataset
-        special_tokens['eos_token'] = '<S>'
- elif corpus.vocab.special:
- raise NotImplementedError('Provided TransformerXL cache.pkl uses an unknown special token. '
- 'You must extend the `to_gluon_vocab` method to support it.')
- else:
- special_tokens['eos_token'] = None
-
- counter = nlp.data.count_tokens(sym2idx.keys())
- vocab = nlp.vocab.Vocab(counter, token_to_idx=sym2idx, **special_tokens)
- return vocab
-
-
-def set_params(model, tf_tensors, kwargs, tie_r):
- # Drop optimizer params
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam'), tf_tensors)
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam_1'), tf_tensors)
- del tf_tensors['global_step']
- del tf_tensors['beta1_power']
- del tf_tensors['beta2_power']
-
- loaded = set() # Cache of processed parameters
-
- if 'embed_cutoffs' in kwargs: # Adaptive Embedding and Softmax
- # Embedding
- for name, param in model._net.embedding._collect_params_with_prefix().items():
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'embedding':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop(
- 'transformer/adaptive_embed/cutoff_{}/lookup_table'.format(i))
- elif purpose == 'projection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_embed/cutoff_{}/proj_W'.format(i)).T
- else:
- raise RuntimeError('Embedding had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
-
- # Softmax
- for name, param in model._net.crit._collect_params_with_prefix().items():
- if param in loaded:
- continue # Some parameters are shared between Embedding and Softmax
-
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'outembedding':
- if postfix == '_weight':
- tf_param = tf_tensors.pop(
- 'transformer/adaptive_softmax/cutoff_{}/lookup_table'.format(i))
- elif postfix == '_bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_{}/b'.format(i))
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- elif purpose == 'outprojection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_{}/proj'.format(i)).T
- elif purpose == 'cluster':
- if postfix == '.weight':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_0/cluster_W')
- elif postfix == '.bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/cutoff_0/cluster_b')
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
- else: # Non-adaptive, (possibly) projected embedding and softmax
- # Embedding
- tf_param = tf_tensors.pop('transformer/adaptive_embed/lookup_table')
- model._net.embedding.embedding_weight.set_data(mx.nd.array(tf_param))
- loaded.add(model._net.embedding.embedding_weight)
- if kwargs['embed_size'] != kwargs['units']:
- tf_param = tf_tensors.pop('transformer/adaptive_embed/proj_W')
- model._net.embedding.projection_weight.set_data(mx.nd.array(tf_param))
- loaded.add(model._net.embedding.projection_weight)
- assert len(model._net.embedding.collect_params().keys()) == 2
- else:
- assert len(model._net.embedding.collect_params().keys()) == 1
-
- # Softmax
- for name, param in model._net.crit._collect_params_with_prefix().items():
- if param in loaded:
- continue # Some parameters are shared between Embedding and Softmax
-
- purpose, i, postfix = re.match(r'([a-zA-Z]*)(\d*)(.*)', name).groups()
- if purpose == 'outembedding':
- if postfix == '_weight':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/lookup_table')
- elif postfix == '_bias':
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/bias')
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
- elif purpose == 'outprojection':
- assert postfix == '_weight'
- tf_param = tf_tensors.pop('transformer/adaptive_softmax/proj').T
- else:
- raise RuntimeError('Softmax had unexpected parameter: {}'.format(name))
-
- param.set_data(mx.nd.array(tf_param))
- loaded.add(param)
-
- tf_r_r_bias = tf_tensors.pop('transformer/r_r_bias')
- tf_r_w_bias = tf_tensors.pop('transformer/r_w_bias')
- for layer_i in range(kwargs['num_layers']):
- # Attention Cell
- attention_cell = model._net.transformer_cells[layer_i].attention_cell
- # TODO(leezu): Duplicate tied parameters until parameter sharing
- # support is improved in Gluon 2. (It is currently impossible to share
- # only subsets of parameters between Blocks due to name clashes between
- # the non-shared parameters (due to same prefix))
- attention_cell.query_key_bias.set_data(
- mx.nd.array(tf_r_w_bias if tie_r else tf_r_w_bias[layer_i]))
- attention_cell.query_emb_bias.set_data(
- mx.nd.array(tf_r_r_bias if tie_r else tf_r_r_bias[layer_i]))
- tf_param = np.split(
- tf_tensors.pop('transformer/layer_{}/rel_attn/qkv/kernel'.format(layer_i)).T, 3, axis=0)
- attention_cell.proj_query.weight.set_data(mx.nd.array(tf_param[0]))
- attention_cell.proj_key.weight.set_data(mx.nd.array(tf_param[1]))
- attention_cell.proj_value.weight.set_data(mx.nd.array(tf_param[2]))
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/r/kernel'.format(layer_i))
- attention_cell.proj_emb.weight.set_data(mx.nd.array(tf_param.T))
-
- # Projection
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/o/kernel'.format(layer_i))
- model._net.transformer_cells[layer_i].proj.weight.set_data(mx.nd.array(tf_param.T))
-
- # Layer Norm
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/LayerNorm/beta'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/rel_attn/LayerNorm/gamma'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.gamma.set_data(mx.nd.array(tf_param))
-
- # FFN
- ffn = model._net.transformer_cells[layer_i].ffn
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/LayerNorm/beta'.format(layer_i))
- ffn.layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/LayerNorm/gamma'.format(layer_i))
- ffn.layer_norm.gamma.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_1/kernel'.format(layer_i))
- ffn.ffn_1.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_1/bias'.format(layer_i))
- ffn.ffn_1.bias.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_2/kernel'.format(layer_i))
- ffn.ffn_2.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('transformer/layer_{}/ff/layer_2/bias'.format(layer_i))
- ffn.ffn_2.bias.set_data(mx.nd.array(tf_param))
-
-
-def convert_transformerxl(args):
- # Load tf model and vocab
- with open(args.cache_pkl, 'rb') as f:
- corpus = pickle.load(f, encoding='latin1')
- vocab = to_gluon_vocab(corpus)
- tf_checkpoint_file = os.path.expanduser(
- os.path.join(args.tf_checkpoint_dir, args.tf_model_prefix))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-
- # Initialize Gluon model
- kwargs, tie_r = to_gluon_kwargs(tf_tensors)
- model = TransformerXL(vocab_size=len(vocab), **kwargs)
- model.initialize(init=mx.init.Normal(0.02))
-
- # Shape inference based on forward pass
- batch_size, seq_len = 2, 16
- mem_length = 100
- mems = model.begin_mems(batch_size, mem_length, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, seq_len))
- model(x, x, mems)
-
- # Convert parameters
- set_params(model, tf_tensors, kwargs, tie_r)
-
- # Serialization
- tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
- model.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- os.remove(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- model.save_parameters(gluon_param_path)
- mx.nd.waitall()
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description='Conversion script for Tensorflow Transformer-XL model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--transformer-xl-repo', type=str, required=True,
- help='Path to https://github.com/kimiyoung/transformer-xl repo.')
- parser.add_argument('--tf-checkpoint-dir', type=str, required=True,
- help='Path to Tensorflow checkpoint folder.')
- parser.add_argument(
- '--tf-model-prefix', type=str, required=True, help='Prefix of the checkpoint files. '
- 'For example model.ckpt-0 or model.ckpt-1191000')
- parser.add_argument('--cache-pkl', type=str, required=True,
- help='Path to TransformerXL cache.pkl file.')
- parser.add_argument('--out-dir', type=str, required=True,
- help='Path to output folder. The folder must exist.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- # Load stuff required for unpickling
- sys.path.append(os.path.join((args.transformer_xl_repo), 'tf'))
- import vocabulary # pylint: disable=unused-import
- import data_utils # pylint: disable=unused-import
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import TransformerXL
-
- convert_transformerxl(args)
diff --git a/scripts/language_model/conversion_utils/convert_xlnet.py b/scripts/language_model/conversion_utils/convert_xlnet.py
deleted file mode 100644
index 1b9a7da37a..0000000000
--- a/scripts/language_model/conversion_utils/convert_xlnet.py
+++ /dev/null
@@ -1,194 +0,0 @@
-# coding: utf-8
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import json
-import logging
-import os
-import sys
-
-import mxnet as mx
-import numpy as np
-
-import gluonnlp as nlp
-from utils import _split_dict, get_hash, read_tf_checkpoint
-
-
-def set_params(model, tf_tensors, kwargs, tie_r):
- # Drop optimizer params
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam'), tf_tensors)
- _, tf_tensors = _split_dict(lambda k, v: k.endswith('Adam_1'), tf_tensors)
- del tf_tensors['global_step']
-
- # Embedding
- tf_param = tf_tensors.pop('model/transformer/word_embedding/lookup_table')
- model._net.word_embed.weight.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/mask_emb/mask_emb')
- model._net.mask_embed.set_data(mx.nd.array(tf_param))
-
- tf_rel_segment_emb = tf_tensors.pop('model/transformer/seg_embed')
-
- tf_r_r_bias = tf_tensors.pop('model/transformer/r_r_bias')
- tf_r_w_bias = tf_tensors.pop('model/transformer/r_w_bias')
- tf_r_s_bias = tf_tensors.pop('model/transformer/r_s_bias')
- for layer_i in range(kwargs['num_layers']):
- # Attention Cell
- attention_cell = model._net.transformer_cells[layer_i].attention_cell
- # TODO(leezu): Duplicate tied parameters until parameter sharing
- # support is improved in Gluon 2. (It is currently impossible to share
- # only subsets of parameters between Blocks due to name clashes between
- # the non-shared parameters (due to same prefix))
- attention_cell.query_key_bias.set_data(
- mx.nd.array(tf_r_w_bias if tie_r else tf_r_w_bias[layer_i]))
- attention_cell.query_emb_bias.set_data(
- mx.nd.array(tf_r_r_bias if tie_r else tf_r_r_bias[layer_i]))
- attention_cell.query_seg_bias.set_data(
- mx.nd.array(tf_r_s_bias if tie_r else tf_r_s_bias[layer_i]))
- shape = (kwargs['units'], kwargs['units'])
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/q/kernel'.format(layer_i))
- attention_cell.proj_query.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/k/kernel'.format(layer_i))
- attention_cell.proj_key.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/v/kernel'.format(layer_i))
- attention_cell.proj_value.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/r/kernel'.format(layer_i))
- attention_cell.proj_emb.weight.set_data(mx.nd.array(tf_param.reshape(shape).T))
- attention_cell.seg_emb.set_data(mx.nd.array(tf_rel_segment_emb[layer_i]))
-
- # Projection
- tf_param = tf_tensors.pop('model/transformer/layer_{}/rel_attn/o/kernel'.format(layer_i))
- model._net.transformer_cells[layer_i].proj.weight.set_data(
- mx.nd.array(tf_param.reshape(shape))) # o kernel should not be transposed
-
- # Layer Norm
- tf_param = tf_tensors.pop(
- 'model/transformer/layer_{}/rel_attn/LayerNorm/beta'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop(
- 'model/transformer/layer_{}/rel_attn/LayerNorm/gamma'.format(layer_i))
- model._net.transformer_cells[layer_i].layer_norm.gamma.set_data(mx.nd.array(tf_param))
-
- # FFN
- ffn = model._net.transformer_cells[layer_i].ffn
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/LayerNorm/beta'.format(layer_i))
- ffn.layer_norm.beta.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/LayerNorm/gamma'.format(layer_i))
- ffn.layer_norm.gamma.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_1/kernel'.format(layer_i))
- ffn.ffn_1.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_1/bias'.format(layer_i))
- ffn.ffn_1.bias.set_data(mx.nd.array(tf_param))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_2/kernel'.format(layer_i))
- ffn.ffn_2.weight.set_data(mx.nd.array(tf_param.T))
- tf_param = tf_tensors.pop('model/transformer/layer_{}/ff/layer_2/bias'.format(layer_i))
- ffn.ffn_2.bias.set_data(mx.nd.array(tf_param))
-
- if 'model/lm_loss/weight' in tf_tensors:
- tf_param = tf_tensors.pop('model/lm_loss/weight')
- model._net.decoder.weight.set_data(tf_param)
- tf_param = tf_tensors.pop('model/lm_loss/bias')
- model._net.decoder.bias.set_data(tf_param)
-
- assert len(tf_tensors.keys()) == 0
-
-
-def convert_xlnet(args):
- # Load vocab
- vocab_file = os.path.join(args.model_dir, 'spiece.model')
-    vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_file, cls_token='<cls>',
-                                                   sep_token='<sep>', mask_token='<mask>')
-
- # Load config
- tf_config_names_to_gluon_config_names = {
- 'd_inner': 'hidden_size',
- 'd_model': 'units',
- 'ff_activation': 'activation',
- 'n_head': 'num_heads',
- 'n_layer': 'num_layers',
- 'n_token': 'vocab_size',
- }
- with open(os.path.join(args.model_dir, 'xlnet_config.json'), 'r') as f:
- tf_config = json.load(f)
- assert tf_config['untie_r']
- del tf_config['untie_r']
- del tf_config['d_head']
- assert len(tf_config) == len(tf_config_names_to_gluon_config_names)
- kwargs = {tf_config_names_to_gluon_config_names[k]: v for k, v in tf_config.items()}
- assert len(vocab) == kwargs['vocab_size']
- print(kwargs)
-
- # Load TF model
- tf_checkpoint_file = os.path.expanduser(os.path.join(args.model_dir, 'xlnet_model.ckpt'))
- tf_tensors = read_tf_checkpoint(tf_checkpoint_file)
-
- # Update kwargs
- kwargs['tie_decoder_weight'] = 'model/lm_loss/weight' not in tf_tensors
-
- # Initialize Gluon model
- model = XLNet(**kwargs)
- model.initialize(init=mx.init.Normal(0.02))
- model.hybridize()
-
- # Shape inference based on forward pass
- batch_size, qlen, mlen = 2, 16, 100
- mems = model.begin_mems(batch_size, mlen, context=mx.cpu())
- x = mx.nd.ones(shape=(batch_size, qlen))
- segments = mx.nd.random_normal(shape=(batch_size, qlen, mlen + qlen, 2))
- segments = segments < 0
- model(x, segments, mems)
-
- # Convert parameters
- set_params(model, tf_tensors, kwargs, tie_r=False)
-
- # Serialization
- tmp_file_path = os.path.expanduser(os.path.join(args.out_dir, 'tmp'))
- with open(tmp_file_path, 'w') as f:
- f.write(vocab.to_json())
- hash_full, hash_short = get_hash(tmp_file_path)
- gluon_vocab_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.vocab'))
- with open(gluon_vocab_path, 'w') as f:
- f.write(vocab.to_json())
- logging.info('vocab file saved to %s. hash = %s', gluon_vocab_path, hash_full)
- model.save_parameters(tmp_file_path)
- hash_full, hash_short = get_hash(tmp_file_path)
- os.remove(tmp_file_path)
- gluon_param_path = os.path.expanduser(os.path.join(args.out_dir, hash_short + '.params'))
- logging.info('param saved to %s. hash = %s', gluon_param_path, hash_full)
- model.save_parameters(gluon_param_path)
- mx.nd.waitall()
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Conversion script for the TensorFlow XLNet model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument(
- '--model-dir', type=str, required=True,
- help='Path to folder including the TensorFlow checkpoint `xlnet_model.ckpt`, '
-        'the SentencePiece model `spiece.model` and the model config `xlnet_config.json`')
- parser.add_argument('--out-dir', type=str, required=True,
- help='Path to output folder. The folder must exist.')
- parser.add_argument('--debug', action='store_true', help='debugging mode')
- args = parser.parse_args()
- logging.getLogger().setLevel(logging.DEBUG if args.debug else logging.INFO)
- logging.info(args)
-
- sys.path.append(os.path.abspath(os.path.join(__file__, os.pardir, os.pardir)))
- from transformer import XLNet
-
- convert_xlnet(args)
diff --git a/scripts/language_model/conversion_utils/utils.py b/scripts/language_model/conversion_utils/utils.py
deleted file mode 100644
index d9c264c035..0000000000
--- a/scripts/language_model/conversion_utils/utils.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import hashlib
-import itertools
-import re
-
-
-def _filter_dict(func, dictionary):
- return {k: v for k, v in dictionary.items() if func(k, v)}
-
-
-def _split_dict(func, dictionary):
- part_one = _filter_dict(func, dictionary)
- part_two = _filter_dict(lambda *args: not func(*args), dictionary)
- return part_one, part_two
-
-
-def get_hash(filename):
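-    """Compute the sha1 hash of a file and return the full digest and its first 8 characters."""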
- sha1 = hashlib.sha1()
- with open(filename, 'rb') as f:
- while True:
- data = f.read(1048576)
- if not data:
- break
- sha1.update(data)
- return sha1.hexdigest(), str(sha1.hexdigest())[:8]
-
-
-def read_tf_checkpoint(path):
- """read tensorflow checkpoint"""
- from tensorflow.python import pywrap_tensorflow
- tensors = {}
- reader = pywrap_tensorflow.NewCheckpointReader(path)
- var_to_shape_map = reader.get_variable_to_shape_map()
- for key in sorted(var_to_shape_map):
- tensor = reader.get_tensor(key)
- tensors[key] = tensor
- return tensors
-
-
-def to_gluon_kwargs(tf_tensors):
- kwargs = dict()
-
- # Main model
- kwargs['num_layers'] = len(
- set(itertools.chain.from_iterable(re.findall(r'layer_\d*', k) for k in tf_tensors)))
- kwargs['hidden_size'] = tf_tensors['transformer/layer_0/ff/layer_2/kernel'].shape[0]
- kwargs['units'] = tf_tensors['transformer/layer_0/ff/layer_2/kernel'].shape[1]
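-    # r_w_bias is 3-dimensional (num_layers, num_heads, d_head) when a separate bias is
-    # learnt per layer; a 2-dimensional bias is shared (tied) across all layers.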
- tie_r = len(tf_tensors['transformer/r_w_bias'].shape) != 3
- kwargs['num_heads'] = tf_tensors['transformer/r_w_bias'].shape[0 if tie_r else 1]
-
- # Embedding and softmax
- if 'transformer/adaptive_embed/lookup_table' in tf_tensors:
- # Adaptive embedding is not used
- kwargs['embed_size'] = tf_tensors['transformer/adaptive_embed/lookup_table'].shape[1]
- kwargs['tie_input_output_embeddings'] = \
- 'transformer/adaptive_softmax/lookup_table' not in tf_tensors
- kwargs['tie_input_output_projections'] = \
- ['transformer/adaptive_softmax/proj' not in tf_tensors]
- else:
- # Adaptive embedding is used
- lookup_table_selector = 'transformer/adaptive_embed/cutoff_{i}/lookup_table'
- kwargs['embed_cutoffs'] = list(
- itertools.accumulate([
- tf_tensors[lookup_table_selector.format(i=i)].shape[0] for i in range(
- len(_filter_dict(lambda k, v: k.endswith('lookup_table'), tf_tensors)))
- ][:-1]))
- kwargs['embed_size'] = tf_tensors[lookup_table_selector.format(i=0)].shape[1]
- size_of_second = tf_tensors[lookup_table_selector.format(i=1)].shape[1]
- kwargs['embed_div_val'] = kwargs['embed_size'] // size_of_second
- assert kwargs['embed_size'] % size_of_second == 0
- kwargs['tie_input_output_embeddings'] = not bool(
- _filter_dict(
- lambda k, v: k.startswith('transformer/adaptive_softmax/cutoff_') and k.endswith(
- 'lookup_table'), tf_tensors))
- proj_selector = 'transformer/adaptive_softmax/cutoff_{i}/proj'
- kwargs['tie_input_output_projections'] = [
- proj_selector.format(i=i) not in tf_tensors
- for i in range(len(kwargs['embed_cutoffs']) + 1)
- ]
-        if kwargs['embed_size'] == kwargs['units'] and \
- 'transformer/adaptive_embed/cutoff_0/proj_W' not in tf_tensors:
- kwargs['project_same_dim'] = False
-
- # Dropout
- # All pre-trained TransformerXL models from
- # https://github.com/kimiyoung/transformer-xl come without dropout
- kwargs['dropout'] = 0
- kwargs['attention_dropout'] = 0
-
- print(kwargs)
- return kwargs, tie_r
diff --git a/scripts/language_model/index.rst b/scripts/language_model/index.rst
deleted file mode 100644
index 9a69f347e0..0000000000
--- a/scripts/language_model/index.rst
+++ /dev/null
@@ -1,301 +0,0 @@
-Language Model
---------------
-
-:download:`Download scripts `
-
-Word Language Model
-~~~~~~~~~~~~~~~~~~~~
-
-Reference: Merity, S., et al. "`Regularizing and optimizing LSTM language models `_". ICLR 2018
-
-
-The key features used to reproduce the results for pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2PHSHvc
-
-The dataset used for training the models is wikitext-2.
-
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Model | awd_lstm_lm_1150_wikitext-2 | awd_lstm_lm_600_wikitext-2 | standard_lstm_lm_1500_wikitext-2 | standard_lstm_lm_650_wikitext-2 | standard_lstm_lm_200_wikitext-2 |
-+===============+============================================================================================================================+===========================================================================================================================+=================================================================================================================================+================================================================================================================================+================================================================================================================================+
-| Mode | LSTM | LSTM | LSTM | LSTM | LSTM |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Num_layers | 3 | 3 | 2 | 2 | 2 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Embed size | 400 | 200 | 1500 | 650 | 200 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Hidden size | 1150 | 600 | 1500 | 650 | 200 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout | 0.4 | 0.2 | 0.65 | 0.5 | 0.2 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_h | 0.2 | 0.1 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_i | 0.65 | 0.3 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Dropout_e | 0.1 | 0.05 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Weight_drop | 0.5 | 0.2 | 0 | 0 | 0 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Val PPL | 68.71 | 84.89 | 86.51 | 90.96 | 107.59 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Test PPL | 65.62 | 80.67 | 82.29 | 86.91 | 101.64 |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Command | [1] | [2] | [3] | [4] | [5] |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ |
-+---------------+----------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------+
-
-For all the above model settings, we set Tied = True and NTASGD = True.
-
-[1] awd_lstm_lm_1150_wikitext-2 (Val PPL 68.71 Test PPL 65.62 )
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_1150_wikitext-2
-
-[2] awd_lstm_lm_600_wikitext-2 (Val PPL 84.89 Test PPL 80.67)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 200 --nhid 600 --epochs 750 --dropout 0.2 --dropout_h 0.1 --dropout_i 0.3 --dropout_e 0.05 --weight_drop 0.2 --tied --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save awd_lstm_lm_600_wikitext-2
-
-[3] standard_lstm_lm_1500_wikitext-2 (Val PPL 86.51 Test PPL 82.29)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 1500 --nhid 1500 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.65 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_1500_wikitext-2
-
-[4] standard_lstm_lm_650_wikitext-2 (Val PPL 90.96 Test PPL 86.91)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 650 --nhid 650 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.5 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_650_wikitext-2
-
-[5] standard_lstm_lm_200_wikitext-2 (Val PPL 107.59 Test PPL 101.64)
-
-.. code-block:: console
-
- $ python word_language_model.py --gpu 0 --emsize 200 --nhid 200 --nlayers 2 --lr 20 --epochs 750 --batch_size 20 --bptt 35 --dropout 0.2 --dropout_h 0 --dropout_i 0 --dropout_e 0 --weight_drop 0 --tied --wd 0 --alpha 0 --beta 0 --ntasgd --lr_update_interval 30 --lr_update_factor 0.1 --save standard_lstm_lm_200_wikitext-2
-
-Cache Language Model
-~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Grave, E., et al. "`Improving neural language models with a continuous cache `_". ICLR 2017
-
-The key features used to reproduce the results based on the corresponding pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2NkpklU
-
-The dataset used for training the models is wikitext-2.
-
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Model | cache_awd_lstm_lm_1150_wikitext-2 | cache_awd_lstm_lm_600_wikitext-2 | cache_standard_lstm_lm_1500_wikitext-2 | cache_standard_lstm_lm_650_wikitext-2 | cache_standard_lstm_lm_200_wikitext-2 |
-+=====================+===================================================================================================================================+==================================================================================================================================+========================================================================================================================================+=======================================================================================================================================+=======================================================================================================================================+
-| Pre-trained setting | Refer to: awd_lstm_lm_1150_wikitext-2 | Refer to: awd_lstm_lm_600_wikitext-2 | Refer to: standard_lstm_lm_1500_wikitext-2 | Refer to: standard_lstm_lm_650_wikitext-2 | Refer to: standard_lstm_lm_200_wikitext-2 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Val PPL | 53.41 | 64.51 | 65.54 | 68.47 | 77.51 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Test PPL | 51.46 | 62.19 | 62.79 | 65.85 | 73.74 |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Command | [1] | [2] | [3] | [4] | [5] |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ | `log `__ | `log `__ | `log `__ | `log `__ |
-+---------------------+-----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
-For all the above model settings, we set lambdas = 0.1279, theta = 0.662, window = 2000, and bptt = 2000.
-
-[1] cache_awd_lstm_lm_1150_wikitext-2 (Val PPL 53.41 Test PPL 51.46)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_1150
-
-[2] cache_awd_lstm_lm_600_wikitext-2 (Val PPL 64.51 Test PPL 62.19)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name awd_lstm_lm_600
-
-[3] cache_standard_lstm_lm_1500_wikitext-2 (Val PPL 65.54 Test PPL 62.79)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_1500
-
-[4] cache_standard_lstm_lm_650_wikitext-2 (Val PPL 68.47 Test PPL 65.85)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_650
-
-[5] cache_standard_lstm_lm_200_wikitext-2 (Val PPL 77.51 Test PPL 73.74)
-
-.. code-block:: console
-
- $ python cache_language_model.py --gpus 0 --model_name standard_lstm_lm_200
-
-Large Scale Word Language Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Jozefowicz, Rafal, et al. "`Exploring the limits of language modeling `_". arXiv preprint arXiv:1602.02410 (2016).
-
-The key features used to reproduce the results for pre-trained models are listed in the following tables.
-
-.. editing URL for the following table: https://bit.ly/2w28VXS
-
-The dataset used for training the models is Google's 1 billion words dataset.
-
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Model | LSTM-2048-512 |
-+=================+==============================================================================================================================+
-| Mode | LSTMP |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num layers | 1 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Embed size | 512 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Hidden size | 2048 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Projection size | 512 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Dropout | 0.1 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Learning rate | 0.2 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num samples | 8192 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Batch size | 128 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Gradient clip | 10.0 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Test perplexity | 43.62 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Num epochs | 50 |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Training logs | `log `__ |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-| Evaluation logs | `log `__ |
-+-----------------+------------------------------------------------------------------------------------------------------------------------------+
-
-[1] LSTM-2048-512 (Test PPL 43.62)
-
-.. code-block:: console
-
- $ python large_word_language_model.py --gpus 0,1,2,3 --clip=10
- $ python large_word_language_model.py --gpus 4 --eval-only --batch-size=1
-
-
-XLNet: Generalized Autoregressive Pretraining for Language Understanding
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Reference: Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., &
-Le, Q. V. "`XLNet: Generalized Autoregressive Pretraining for Language
-Understanding. `_" arXiv preprint
-arXiv:1906.08237 (2019).
-
-
-The following pre-trained XLNet models are available from the **get_model** API:
-
-+-------------------+--------------------------+-----------------------------+
-| | xlnet_cased_l12_h768_a12 | xlnet_cased_l24_h1024_a16 |
-+===================+==========================+=============================+
-| 126gb | ✓ | ✓ |
-+-------------------+--------------------------+-----------------------------+
-
-where **126gb** refers to the 126 GB training corpus used by the XLNet
-paper authors.
-
-.. code-block:: python
-
- import gluonnlp as nlp; import mxnet as mx
- from transformer import get_model, XLNetTokenizer
- model, vocab, tokenizer = get_model('xlnet_cased_l12_h768_a12', dataset_name='126gb', use_decoder=True)
- indices = mx.nd.array([vocab.to_indices(tokenizer('Hello world'))])
- token_types = mx.nd.ones_like(indices)
- mems = model.begin_mems(batch_size=1, mem_len=500, context=indices.context)
- output, new_mems = model(indices, token_types, mems)
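-
-As a rough sketch (assuming the 12-layer model above with ``units=768``; the snippet simply
-continues the example), the returned ``new_mems`` can be fed back as the memory for the next
-segment:
-
-.. code-block:: python
-
-    # Hypothetical continuation: reuse the updated memory for the next text segment.
-    next_indices = mx.nd.array([vocab.to_indices(tokenizer('How are you?'))])
-    next_token_types = mx.nd.ones_like(next_indices)
-    output, new_mems = model(next_indices, next_token_types, new_mems)
-    print(output.shape)  # expected: (1, number of tokens, 768) for xlnet_cased_l12_h768_a12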
-
-Sentence Classification
-~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides the following example script for fine-tuning sentence classification tasks with a
-pre-trained XLNet model.
-
-Results using `xlnet_12_768_12`:
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |59.33 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |94.61 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |89.22/92.20 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |89.34 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91.31 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |87.19/86.45 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QNLI |Accuracy |88 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |75.09 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-Results using `xlnet_24_1024_16`, following the hyperparameters reported by the paper authors:
-
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-|Task Name |Metrics |Results on Dev Set |log |command |
-+=================+=====================+=======================+============================================================================================================================================+=================================================================================================================================================================+
-| CoLA |Matthew Corr. |67 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| SST-2 |Accuracy |94 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MRPC |Accuracy/F1 |90.2/93 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| STS-B |Pearson Corr. |91.37 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| QQP |Accuracy |91.94 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| MNLI |Accuracy(m/mm) |89.93/89.91 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| RTE |Accuracy |84.12 |`log `__ |`command `__ |
-+-----------------+---------------------+-----------------------+--------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-Question Answering on SQuAD
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Dataset | SQuAD 1.1 | SQuAD 1.1 | SQuAD 2.0 | SQuAD 2.0 |
-+===========+=========================================================================================================================================================+==========================================================================================================================================================+==================================================================================================================================================================================================================================================================================================================+==================================================================================================================================================================================================================================================================================================================+
-| Model | xlnet_12_768_12 | xlnet_24_1024_16 | xlnet_12_768_12 | xlnet_24_1024_16 |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| EM / F1 | 85.50 / 91.77 | 89.08 / 94.52 | 80.47 / 83.22 | 86.08 / 86.69 |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Log | `log `__ | `log `__ | `log `__ | `log `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Command | `command `__ | `command `__ | `command `__ | `command `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-| Prediction| `predictions.json `__ | `predictions.json `__ | `predictions.json `__ `null_odds.json `__ | `predictions.json `__ `null_odds.json `__ |
-+-----------+---------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
-
-For `xlnet_24_1024_16`, we used the hyperparameters reported by the paper authors.
-
-
-To score the predictions on the dev data, you need to download the official evaluation script (`evaluate-v2.0.py `_).
-You can either place the script in the same folder as run_squad.py so that our script runs it automatically,
-or run it yourself with the following commands:
-
-SQuAD 1.1:
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json
-
-SQuAD 2.0:
-
-.. code-block:: console
-
- $ python evaluate-v2.0.py dev-v2.0.json predictions.json --na-prob-file null_odds.json
diff --git a/scripts/language_model/large_word_language_model.py b/scripts/language_model/large_word_language_model.py
deleted file mode 100644
index 570b89e7e2..0000000000
--- a/scripts/language_model/large_word_language_model.py
+++ /dev/null
@@ -1,357 +0,0 @@
-"""
-Large Word Language Model
-=========================
-
-This example shows how to build a word-level language model on the Google Billion Words dataset
-with the Gluon NLP Toolkit.
-By using the existing data pipeline tools and building blocks, the process is greatly simplified.
-
-We implement the LSTM 2048-512 language model proposed in the following work.
-
-@article{jozefowicz2016exploring,
- title={Exploring the Limits of Language Modeling},
- author={Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui},
- journal={arXiv preprint arXiv:1602.02410},
- year={2016}
-}
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import time
-import math
-import os
-import sys
-import argparse
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-from gluonnlp.utils import Parallel
-from gluonnlp.model.train.language_model import ParallelBigRNN
-from sampler import LogUniformSampler
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-###############################################################################
-# Arg parser
-###############################################################################
-parser = argparse.ArgumentParser(description=
- 'Gluon-NLP Big LSTM 2048-512 Language Model on GBW')
-parser.add_argument('--save', type=str, default='model.params',
- help='path to save the final model.')
-parser.add_argument('--emsize', type=int, default=512,
- help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=2048,
- help='number of hidden units per layer')
-parser.add_argument('--nproj', type=int, default=512,
-                    help='number of projection units per layer. Can differ from emsize')
-parser.add_argument('--nlayers', type=int, default=1,
- help='number of layers')
-parser.add_argument('--from-epoch', type=int, default=None,
- help='start training or testing from the provided epoch')
-parser.add_argument('--epochs', type=int, default=50,
- help='number of epoch for training')
-parser.add_argument('--batch-size', type=int, default=128,
- help='batch size per gpu')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--eps', type=float, default=1,
- help='initial history accumulation for adagrad')
-parser.add_argument('--bptt', type=int, default=20,
- help='sequence length')
-parser.add_argument('--k', type=int, default=8192,
- help='number of noise samples for estimation')
-parser.add_argument('--gpus', type=str,
-                    help='list of GPUs to use, e.g. 0 or 0,2,5. Empty means using the CPU.')
-parser.add_argument('--log-interval', type=int, default=1000,
- help='report interval')
-parser.add_argument('--seed', type=int, default=0,
- help='random seed')
-parser.add_argument('--lr', type=float, default=0.2,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=1.0,
- help='gradient clipping by global norm.')
-parser.add_argument('--test-mode', action='store_true',
- help='Whether to run through the script with few examples')
-parser.add_argument('--eval-only', action='store_true',
- help='Whether to only run evaluation for the trained model')
-args = parser.parse_args()
-
-segments = ['train', 'test']
-max_nbatch_eval = None
-
-if args.test_mode:
- args.emsize = 200
- args.log_interval = 1
- args.nhid = 200
- args.nlayers = 1
- args.epochs = 20
- max_nbatch_eval = 3
- segments = ['test', 'test']
-
-print(args)
-mx.random.seed(args.seed)
-np.random.seed(args.seed)
-
-context = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-
-os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
-os.environ['MXNET_CPU_PARALLEL_RAND_COPY'] = str(len(context))
-os.environ['MXNET_CPU_WORKER_NTHREADS'] = str(len(context))
-
-###############################################################################
-# Data stream
-###############################################################################
-train_data_stream, test_data_stream = \
-    [nlp.data.GBWStream(segment=segment, skip_empty=True, bos=None, eos='<eos>')
- for segment in segments]
-vocab = train_data_stream.vocab
-ntokens = len(vocab)
-
-# Sampler for generating negative classes during training with importance sampling
-sampler = LogUniformSampler(ntokens, args.k)
-
-# Given a list of (array, context) pairs, load array[i] on context[i]
-def _load(xs):
- ret = []
- for x, ctx in zip(xs, context):
- if isinstance(x, tuple):
- ret.append([y.as_in_context(ctx) for y in x])
- else:
- ret.append(x.as_in_context(ctx))
- return ret
-
-# Transformation applied to each training batch.
-# First, load the data, target and mask onto their target contexts.
-# Second, because the LSTM-2048-512 model decodes with importance sampling during
-# training, sample the negative candidate classes by invoking the log-uniform sampler.
-def _split_and_sample(x, y):
- m = x != vocab[vocab.padding_token] # mask padding
- num_ctx = len(context)
- if num_ctx > 1:
- xs = gluon.utils.split_data(x, num_ctx, batch_axis=1, even_split=True)
- ys = gluon.utils.split_data(y, num_ctx, batch_axis=1, even_split=True)
- ms = gluon.utils.split_data(m, num_ctx, batch_axis=1, even_split=True)
- else:
- xs, ys, ms = [x], [y], [m]
- xs = _load(xs)
- ys = _load(ys)
- ms = _load(ms)
- ss = [sampler(y) for y in ys]
- ss = _load(ss)
- return xs, ys, ms, ss
-
-train_batch_size = args.batch_size * len(context)
-train_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, train_batch_size)
-train_data = train_batchify(train_data_stream)
-train_data = train_data.transform(_split_and_sample)
-
-test_batch_size = args.batch_size
-test_batchify = nlp.data.batchify.StreamBPTTBatchify(vocab, args.bptt, test_batch_size)
-test_data = test_batchify(test_data_stream)
-test_data = nlp.data.PrefetchingStream(test_data)
-
-###############################################################################
-# Build the model
-###############################################################################
-
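-# The evaluation model decodes with a full softmax over the vocabulary, whereas the
-# training model approximates it with importance sampling over args.k noise classes.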
-eval_model = nlp.model.language_model.BigRNN(ntokens, args.emsize, args.nhid,
- args.nlayers, args.nproj,
- embed_dropout=args.dropout,
- encode_dropout=args.dropout)
-model = nlp.model.language_model.train.BigRNN(ntokens, args.emsize, args.nhid,
- args.nlayers, args.nproj, args.k,
- embed_dropout=args.dropout,
- encode_dropout=args.dropout)
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-###############################################################################
-# Training code
-###############################################################################
-
-def train():
- """Training loop for language model.
- """
- print(model)
- from_epoch = 0
- model.initialize(mx.init.Xavier(factor_type='out'), ctx=context)
- trainer_params = {'learning_rate': args.lr, 'wd': 0, 'eps': args.eps}
- trainer = gluon.Trainer(model.collect_params(), 'adagrad', trainer_params)
- if args.from_epoch:
- from_epoch = args.from_epoch
- checkpoint_name = '%s.%s'%(args.save, format(from_epoch - 1, '02d'))
- model.load_parameters(checkpoint_name)
- trainer.load_states('%s.state'%args.save)
- print('Loaded parameters from checkpoint %s'%(checkpoint_name))
-
- model.hybridize(static_alloc=True, static_shape=True)
- encoder_params = model.encoder.collect_params().values()
- embedding_params = list(model.embedding.collect_params().values())
- parallel_model = ParallelBigRNN(model, loss, args.batch_size)
- parallel = Parallel(len(context), parallel_model)
- for epoch in range(from_epoch, args.epochs):
- sys.stdout.flush()
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(batch_size=args.batch_size,
- func=mx.nd.zeros, ctx=ctx) for ctx in context]
- nbatch = 0
- has_next = True
- train_data_iter = iter(train_data)
- data, target, mask, sample = next(train_data_iter)
-
- while has_next:
- nbatch += 1
- hiddens = detach(hiddens)
- Ls = []
- for _, batch in enumerate(zip(data, target, mask, sample, hiddens)):
- parallel.put(batch)
-
- for _ in range(len(data)):
- hidden, ls = parallel.get()
- # hidden states are ordered by context id
- index = context.index(hidden[0].context)
- hiddens[index] = hidden
- Ls.append(ls)
-
- # prefetch the next batch of data
- try:
- data, target, mask, sample = next(train_data_iter)
- except StopIteration:
- has_next = False
-
- # rescale embedding grad
- for ctx in context:
- x = embedding_params[0].grad(ctx)
- x[:] *= args.batch_size
- encoder_grad = [p.grad(ctx) for p in encoder_params]
- # perform gradient clipping per ctx
- gluon.utils.clip_global_norm(encoder_grad, args.clip)
-
- trainer.step(len(context))
-
- total_L += sum([mx.nd.sum(L).asscalar() / args.bptt for L in Ls])
-
- if nbatch % args.log_interval == 0:
- cur_L = total_L / args.log_interval / len(context)
- ppl = math.exp(cur_L) if cur_L < 100 else float('inf')
- print('[Epoch %d Batch %d] loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s'
- %(epoch, nbatch, cur_L, ppl,
- train_batch_size*args.log_interval/(time.time()-start_log_interval_time)))
- total_L = 0.0
- start_log_interval_time = time.time()
- sys.stdout.flush()
-
- end_epoch_time = time.time()
- print('Epoch %d took %.2f seconds.'%(epoch, end_epoch_time - start_epoch_time))
- mx.nd.waitall()
- checkpoint_name = '%s.%s'%(args.save, format(epoch, '02d'))
- model.save_parameters(checkpoint_name)
- trainer.save_states('%s.state'%args.save)
-
-def detach(hidden):
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-def test(data_stream, batch_size, ctx=None):
- """Evaluate the model on the dataset.
-
- Parameters
- ----------
- data_stream : DataStream
- The dataset to evaluate on.
- batch_size : int
- The size of the mini-batch.
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
- total_L = 0.0
- ntotal = 0
- nbatch = 0
- hidden = eval_model.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=ctx)
- start_time = time.time()
- for data, target in data_stream:
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- mask = data != vocab[vocab.padding_token]
- output, hidden = eval_model(data, hidden)
- hidden = detach(hidden)
- output = output.reshape((-3, -1))
- L = loss(output, target.reshape(-1,)) * mask.reshape((-1,))
- total_L += L.mean()
- ntotal += mask.mean()
- nbatch += 1
- avg = total_L / ntotal
- if nbatch % args.log_interval == 0:
- avg_scalar = float(avg.asscalar())
- ppl = math.exp(avg_scalar)
- throughput = batch_size*args.log_interval/(time.time()-start_time)
- print('Evaluation batch %d: test loss %.2f, test ppl %.2f, '
- 'throughput = %.2f samples/s'%(nbatch, avg_scalar, ppl, throughput))
- start_time = time.time()
- if max_nbatch_eval and nbatch > max_nbatch_eval:
- print('Quit evaluation early at batch %d'%nbatch)
- break
- return float(avg.asscalar())
-
-def evaluate():
- """ Evaluate loop for the trained model """
- print(eval_model)
- eval_model.initialize(mx.init.Xavier(), ctx=context[0])
- eval_model.hybridize(static_alloc=True, static_shape=True)
- epoch = args.from_epoch if args.from_epoch else 0
- while epoch < args.epochs:
- checkpoint_name = '%s.%s'%(args.save, format(epoch, '02d'))
- if not os.path.exists(checkpoint_name):
- print('Wait for a new checkpoint...')
- # check again after 600 seconds
- time.sleep(600)
- continue
- eval_model.load_parameters(checkpoint_name)
- print('Loaded parameters from checkpoint %s'%(checkpoint_name))
- start_epoch_time = time.time()
- final_test_L = test(test_data, test_batch_size, ctx=context[0])
- end_epoch_time = time.time()
- print('[Epoch %d] test loss %.2f, test ppl %.2f'%
- (epoch, final_test_L, math.exp(final_test_L)))
- print('Epoch %d took %.2f seconds.'%(epoch, end_epoch_time - start_epoch_time))
- sys.stdout.flush()
- epoch += 1
-
-if __name__ == '__main__':
- if args.eval_only:
- evaluate()
- else:
- train()
diff --git a/scripts/language_model/model/XLNet_classifier.py b/scripts/language_model/model/XLNet_classifier.py
deleted file mode 100644
index 18f91526b1..0000000000
--- a/scripts/language_model/model/XLNet_classifier.py
+++ /dev/null
@@ -1,90 +0,0 @@
-"""Model for sentence (pair) classification task/ regression with XLnet.
-"""
-from mxnet.gluon import Block
-from mxnet.gluon import nn
-import mxnet as mx
-
-
-class XLNetClassifier(Block):
- """XLNet Classifier
- """
- def __init__(self, xl, units=768, num_classes=2, dropout=0.0,
- prefix=None, params=None):
- super(XLNetClassifier, self).__init__(prefix=prefix, params=params)
- self.xlnet = xl
- self._units = units
- with self.name_scope():
- self.classifier = nn.HybridSequential(prefix=prefix)
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=num_classes, flatten=False))
- self.pooler = nn.Dense(units=units, flatten=False, activation='tanh', prefix=prefix)
-
- def __call__(self, inputs, token_types, valid_length=None, mems=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray or Symbol, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray or Symbol, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or Symbol, or None, shape (batch_size)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray or Symbol
- Shape (batch_size, num_classes)
- """
- return super(XLNetClassifier, self).__call__(inputs, token_types, valid_length, mems)
-
- def _apply_pooling(self, sequence, valid_length):
- """Generate the representation given the inputs.
-
- This is used for pre-training or fine-tuning an XLNet model.
- """
- F = mx.ndarray
- index = F.contrib.arange_like(sequence, axis=0, ctx=sequence.context).expand_dims(1)
- valid_length_rs = valid_length.reshape((-1, 1)) - 1
- gather_index = F.concat(index, valid_length_rs).T
- cls_states = F.gather_nd(sequence, gather_index)
- return self.pooler(cls_states)
-
- def _padding_mask(self, inputs, valid_length):
- F = mx.ndarray
- valid_length = valid_length.astype(inputs.dtype)
- steps = F.contrib.arange_like(inputs, axis=1)
- ones = F.ones_like(steps)
- mask = F.broadcast_lesser(F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length, shape=(-1, 1)))
- mask = F.broadcast_mul(F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- return mask
-
- def forward(self, inputs, token_types, valid_length=None, mems=None):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray or Symbol, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray or Symbol, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size)
- Valid length of the sequence. This is used to mask the padded tokens.
-
- Returns
- -------
- outputs : NDArray
- Shape (batch_size, num_classes)
- """
- attention_mask = self._padding_mask(inputs, valid_length).astype('float32')
- output, _ = self.xlnet(inputs, token_types, mems, attention_mask)
- output = self._apply_pooling(output, valid_length.astype('float32'))
- pooler_out = self.pooler(output)
- return self.classifier(pooler_out)
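The index arithmetic in `_apply_pooling` above simply picks, for every example, the hidden state of the last valid token (XLNet appends its [CLS]-style token at the end of the sequence) before feeding it to the pooler. A minimal NumPy sketch of the same selection, assuming a (batch_size, seq_length, units) output layout:

import numpy as np

def last_valid_token(sequence, valid_length):
    # Illustrative equivalent of the gather_nd construction in
    # XLNetClassifier._apply_pooling: pick the hidden state of the last
    # valid token of every example. Shapes are assumed, not taken from
    # the deleted module.
    batch_idx = np.arange(sequence.shape[0])
    return sequence[batch_idx, valid_length - 1]

seq = np.random.randn(2, 5, 4)           # (batch, seq_len, units)
lengths = np.array([3, 5])
pooled = last_valid_token(seq, lengths)  # shape (2, 4)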
diff --git a/scripts/language_model/model/qa.py b/scripts/language_model/model/qa.py
deleted file mode 100644
index 73619efff6..0000000000
--- a/scripts/language_model/model/qa.py
+++ /dev/null
@@ -1,345 +0,0 @@
-"""XLNetForQA models."""
-
-import mxnet as mx
-from mxnet.gluon import HybridBlock, Block, loss, nn
-
-
-class PoolerStartLogits(HybridBlock):
- """ Compute SQuAD start_logits from sequence hidden states."""
- def __init__(self, prefix=None, params=None):
- super(PoolerStartLogits, self).__init__(prefix=prefix, params=params)
- self.dense = nn.Dense(1, flatten=False)
-
- def __call__(self, hidden_states, p_masks=None):
- # pylint: disable=arguments-differ
- return super(PoolerStartLogits, self).__call__(hidden_states, p_masks)
-
- def hybrid_forward(self, F, hidden_states, p_mask):
- """Get start logits from the model output.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- p_mask : NDArray or None, shape(batch_size, seq_length)
-
- Returns
- -------
- x : NDarray, shape(batch_size, seq_length)
- Masked start logits.
- """
- # pylint: disable=arguments-differ
- x = self.dense(hidden_states).squeeze(-1)
- if p_mask is not None:
- x = x * (1 - p_mask) - 1e30 * p_mask
- return x
-
-
-class PoolerEndLogits(HybridBlock):
- """ Compute SQuAD end_logits from sequence hidden states and start token hidden state."""
- def __init__(self, units=768, is_eval=False, prefix=None, params=None):
- super(PoolerEndLogits, self).__init__(prefix=prefix, params=params)
- self._eval = is_eval
- self._hsz = units
- with self.name_scope():
- self.dense_0 = nn.Dense(units, activation='tanh', flatten=False)
- self.dense_1 = nn.Dense(1, flatten=False)
- self.layernorm = nn.LayerNorm(epsilon=1e-12, in_channels=units)
-
- def __call__(self,
- hidden_states,
- start_states=None,
- start_positions=None,
- p_masks=None):
- # pylint: disable=arguments-differ
- return super(PoolerEndLogits,
- self).__call__(hidden_states, start_states,
- start_positions, p_masks)
-
- def hybrid_forward(self, F, hidden_states, start_states, start_positions, p_mask):
- # pylint: disable=arguments-differ
- """Get end logits from the model output and start states or start positions.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- start_states : NDArray, shape (batch_size, seq_length, start_n_top, hidden_size)
- Used during inference
- start_positions : NDArray, shape (batch_size)
- Ground-truth start positions used during training.
- p_mask : NDArray or None, shape(batch_size, seq_length)
-
- Returns
- -------
- x : NDarray, shape(batch_size, seq_length)
- Masked end logits.
- """
- if not self._eval:
- start_states = F.gather_nd(
- hidden_states,
- F.concat(
- F.contrib.arange_like(hidden_states,
- axis=0).expand_dims(1),
- start_positions.expand_dims(
- 1)).transpose()) # shape(bsz, hsz)
- start_states = start_states.expand_dims(1)
- start_states = F.broadcast_like(
- start_states, hidden_states) # shape (bsz, slen, hsz)
- x = self.dense_0(F.concat(hidden_states, start_states, dim=-1))
- x = self.layernorm(x)
- x = self.dense_1(x).squeeze(-1)
- if p_mask is not None and self._eval:
- p_mask = p_mask.expand_dims(-1)
- p_mask = F.broadcast_like(p_mask, x)
- if p_mask is not None:
- x = x * (1 - p_mask) - 1e30 * p_mask
- return x
-
-
-class XLNetPoolerAnswerClass(HybridBlock):
- """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
- def __init__(self, units=768, dropout=0.1, prefix=None, params=None):
- super(XLNetPoolerAnswerClass, self).__init__(prefix=prefix,
- params=params)
- with self.name_scope():
- self._units = units
- self.dense_0 = nn.Dense(units,
- in_units=2 * units,
- activation='tanh',
- use_bias=True,
- flatten=False)
- self.dense_1 = nn.Dense(1,
- in_units=units,
- use_bias=False,
- flatten=False)
- self._dropout = nn.Dropout(dropout)
-
- def __call__(self, hidden_states, start_states=None, cls_index=None):
- # pylint: disable=arguments-differ
- return super(XLNetPoolerAnswerClass,
- self).__call__(hidden_states, start_states, cls_index)
-
- def hybrid_forward(self, F, hidden_states, start_states, cls_index):
- # pylint: disable=arguments-differ
- """Get answerability logits from the model output and start states.
-
- Parameters
- ----------
- hidden_states : NDArray, shape (batch_size, seq_length, hidden_size)
- start_states : NDArray, shape (batch_size, hidden_size)
- Typically weighted average hidden_states along second dimension.
- cls_index : NDArray, shape (batch_size)
- Index of [CLS] token in sequence.
-
- Returns
- -------
- x : NDarray, shape(batch_size,)
- CLS logits.
- """
- index = F.contrib.arange_like(hidden_states,
- axis=0).expand_dims(1)
- valid_length_rs = cls_index.reshape((-1, 1)) - 1
- gather_index = F.transpose(F.concat(index, valid_length_rs), axes=(1, 0))
- cls_token_state = F.gather_nd(hidden_states, gather_index)
-
- x = self.dense_0(F.concat(start_states, cls_token_state, dim=-1))
- x = self._dropout(x)
- x = self.dense_1(x).squeeze(-1)
- return x
-
-
-class XLNetForQA(Block):
- """Model for SQuAD task with XLNet.
-
- Parameters
- ----------
- xlnet_base: XLNet Block
- start_top_n : int
- Number of start position candidates during inference.
- end_top_n : int
- Number of end position candidates for each start position during inference.
- is_eval : Bool
- If set to True, do inference.
- prefix : str or None
- See document of `mx.gluon.Block`.
- params : ParameterDict or None
- See document of `mx.gluon.Block`.
- """
- def __init__(self,
- xlnet_base,
- start_top_n=None,
- end_top_n=None,
- is_eval=False,
- units=768,
- prefix=None,
- params=None):
- super(XLNetForQA, self).__init__(prefix=prefix, params=params)
- with self.name_scope():
- self.xlnet = xlnet_base
- self.start_top_n = start_top_n
- self.end_top_n = end_top_n
- self.loss = loss.SoftmaxCELoss()
- self.start_logits = PoolerStartLogits()
- self.end_logits = PoolerEndLogits(units=units, is_eval=is_eval)
- self.eval = is_eval
- self.answer_class = XLNetPoolerAnswerClass(units=units)
- self.cls_loss = loss.SigmoidBinaryCrossEntropyLoss()
-
- def __call__(self,
- inputs,
- token_types,
- valid_length=None,
- label=None,
- p_mask=None,
- is_impossible=None,
- mems=None):
- #pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences."""
- valid_length = [] if valid_length is None else valid_length
- return super(XLNetForQA,
- self).__call__(inputs, token_types, valid_length, p_mask,
- label, is_impossible, mems)
-
- def _padding_mask(self, inputs, valid_length, left_pad=False):
- F = mx.ndarray
- if left_pad:
- # left padding
- valid_length_start = valid_length.astype('int64')
- steps = F.contrib.arange_like(inputs, axis=1) + 1
- ones = F.ones_like(steps)
- mask = F.broadcast_greater(
- F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length_start, shape=(-1, 1)))
- mask = F.broadcast_mul(
- F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- else:
- # right padding
- valid_length = valid_length.astype(inputs.dtype)
- steps = F.contrib.arange_like(inputs, axis=1)
- ones = F.ones_like(steps)
- mask = F.broadcast_lesser(F.reshape(steps, shape=(1, -1)),
- F.reshape(valid_length, shape=(-1, 1)))
- mask = F.broadcast_mul(
- F.expand_dims(mask, axis=1),
- F.broadcast_mul(ones, F.reshape(ones, shape=(-1, 1))))
- return mask
-
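For right padding, `_padding_mask` above builds a (batch_size, seq_length, seq_length) attention mask whose entry [b, q, k] is 1 exactly when key position k falls inside the valid region of example b; the query axis is only broadcast. A NumPy sketch of the same tensor, under the right-padding assumption:

import numpy as np

def right_padding_mask(seq_length, valid_length):
    # Illustrative equivalent of XLNetForQA._padding_mask (right padding):
    # mask[b, q, k] == 1.0 iff k < valid_length[b]; every query row of one
    # example gets the same key mask.
    key_is_valid = np.arange(seq_length)[None, :] < valid_length[:, None]  # (batch, seq_len)
    return np.repeat(key_is_valid[:, None, :], seq_length, axis=1).astype('float32')

mask = right_padding_mask(seq_length=4, valid_length=np.array([2, 4]))
# mask[0] rows are all [1, 1, 0, 0]; mask[1] rows are all [1, 1, 1, 1]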
- def forward(self, inputs, token_types, valid_length, p_mask, label,
- is_impossible, mems):
- # pylint: disable=arguments-differ
- """Generate the unnormalized score for the given the input sequences.
-
- Parameters
- ----------
- inputs : NDArray, shape (batch_size, seq_length)
- Input words for the sequences.
- token_types : NDArray, shape (batch_size, seq_length)
- Token types for the sequences, used to indicate whether the word belongs to the
- first sentence or the second one.
- valid_length : NDArray or None, shape (batch_size,)
- Valid length of the sequence. This is used to mask the padded tokens.
- p_mask : NDArray or None, shape (batch_size, seq_length)
- We do not want special tokens (e.g., [SEP], [PAD]) and question tokens to be
- included in the answer. Set to 1 to mask the token.
- label : NDArray, shape (batch_size, 1)
- Ground-truth label (start/end position) for loss computation.
- is_impossible : NDArray or None, shape (batch_size, 1)
- Ground-truth label (is impossible) for loss computation. Set to None for SQuAD 1.1.
- mems : NDArray
- We do not use memory (a Transformer-XL component) during fine-tuning.
-
- Returns
- -------
- For training we have:
- total_loss : list of NDArray
- Specifically, we have a span loss (batch_size, ) and a cls_loss (batch_size, )
- total_loss_sum : NDArray
-
- For inference we have:
- start_top_log_probs : NDArray, shape (batch_size, start_n_top, )
- start_top_index : NDArray, shape (batch_size, start_n_top)
- end_top_log_probs : NDArray, shape (batch_size, start_n_top * end_n_top)
- end_top_index : NDArray, shape (batch_size, start_n_top * end_n_top)
- cls_logits : NDArray or None, shape (batch_size, )
- """
- if isinstance(valid_length, list) and len(valid_length) == 0:
- valid_length = None
- attention_mask = self._padding_mask(inputs,
- valid_length).astype('float32')
- output, _ = self.xlnet(inputs, token_types, mems, attention_mask)
- start_logits = self.start_logits(output,
- p_masks=p_mask) # shape (bsz, slen)
- bsz, slen, hsz = output.shape
- if not self.eval:
- # training
- start_positions, end_positions = label
- end_logit = self.end_logits(output,
- start_positions=start_positions,
- p_masks=p_mask)
- span_loss = (self.loss(start_logits, start_positions) +
- self.loss(end_logit, end_positions)) / 2
-
- total_loss = [span_loss]
-
- # get cls loss
- start_log_probs = mx.nd.softmax(start_logits, axis=-1)
- start_states = mx.nd.batch_dot(output,
- start_log_probs.expand_dims(-1),
- transpose_a=True).squeeze(-1)
-
- cls_logits = self.answer_class(output, start_states,
- valid_length)
- cls_loss = self.cls_loss(cls_logits, is_impossible)
- total_loss.append(0.5 * cls_loss)
- total_loss_sum = span_loss + 0.5 * cls_loss
- return total_loss, total_loss_sum
- else:
- #inference
- start_log_probs = mx.nd.log_softmax(start_logits,
- axis=-1) # shape (bsz, slen)
- start_top_log_probs, start_top_index = mx.ndarray.topk(
- start_log_probs, k=self.start_top_n, axis=-1,
- ret_typ='both') # shape (bsz, start_n_top)
- index = mx.nd.concat(*[
- mx.nd.arange(bsz, ctx=start_log_probs.context).expand_dims(1)
- ] * self.start_top_n).reshape(bsz * self.start_top_n, 1)
- start_top_index_rs = start_top_index.reshape((-1, 1))
- gather_index = mx.nd.concat(
- index, start_top_index_rs).T #shape(2, bsz * start_n_top)
- start_states = mx.nd.gather_nd(output, gather_index).reshape(
- (bsz, self.start_top_n, hsz)) #shape (bsz, start_n_top, hsz)
-
- start_states = start_states.expand_dims(1)
- start_states = mx.nd.broadcast_to(
- start_states, (bsz, slen, self.start_top_n,
- hsz)) # shape (bsz, slen, start_n_top, hsz)
- hidden_states_expanded = output.expand_dims(2)
- hidden_states_expanded = mx.ndarray.broadcast_to(
- hidden_states_expanded, shape=start_states.shape
- ) # shape (bsz, slen, start_n_top, hsz)
- end_logits = self.end_logits(
- hidden_states_expanded,
- start_states=start_states,
- p_masks=p_mask) # shape (bsz, slen, start_n_top)
- end_log_probs = mx.nd.log_softmax(
- end_logits, axis=1) # shape (bsz, slen, start_n_top)
- # Note that end_top_index and end_top_log_probs have shape (bsz, end_n_top, start_n_top),
- # so for each start position there are end_n_top candidate end positions along the second dim.
- end_top_log_probs, end_top_index = mx.ndarray.topk(
- end_log_probs, k=self.end_top_n, axis=1,
- ret_typ='both') # shape (bsz, end_n_top, start_n_top)
- end_top_log_probs = end_top_log_probs.reshape(
- (-1, self.start_top_n * self.end_top_n))
- end_top_index = end_top_index.reshape(
- (-1, self.start_top_n * self.end_top_n))
-
- start_probs = mx.nd.softmax(start_logits, axis=-1)
- start_states = mx.nd.batch_dot(output,
- start_probs.expand_dims(-1),
- transpose_a=True).squeeze(-1)
- cls_logits = self.answer_class(output, start_states,
- valid_length)
-
- outputs = (start_top_log_probs, start_top_index, end_top_log_probs,
- end_top_index, cls_logits)
- return outputs
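At inference time, the forward pass above returns the top start_n_top start positions per example and, for each of them, end_n_top end positions; the final answer selection (including the SQuAD 2.0 null-answer score) happens in `xlnet_qa_evaluate.predict_extended`, which is not part of this file. Purely as a sketch of the idea, candidate spans for one example could be ranked by the summed start/end log-probabilities, working on the un-flattened (end_n_top, start_n_top) layout produced just before the final reshape:

import numpy as np

def rank_spans(start_top_log_probs, start_top_index,
               end_top_log_probs, end_top_index, max_answer_length=64):
    # start_top_log_probs/start_top_index: shape (start_n_top,)
    # end_top_log_probs/end_top_index:     shape (end_n_top, start_n_top)
    # Illustrative only; the real post-processing lives in
    # xlnet_qa_evaluate.predict_extended.
    candidates = []
    for s_rank, (s_logp, s_idx) in enumerate(zip(start_top_log_probs, start_top_index)):
        for e_logp, e_idx in zip(end_top_log_probs[:, s_rank], end_top_index[:, s_rank]):
            if e_idx < s_idx or e_idx - s_idx + 1 > max_answer_length:
                continue  # drop malformed or over-long spans
            candidates.append((float(s_logp + e_logp), int(s_idx), int(e_idx)))
    return sorted(candidates, reverse=True)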
diff --git a/scripts/language_model/run_glue.py b/scripts/language_model/run_glue.py
deleted file mode 100644
index 7f9041f836..0000000000
--- a/scripts/language_model/run_glue.py
+++ /dev/null
@@ -1,658 +0,0 @@
-"""
-Sentence Pair Classification with XLNet
-"""
-import io
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-from functools import partial
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-from gluonnlp.data.classification import get_task
-from gluonnlp.data.bert.glue import truncate_seqs_equal, concat_sequences
-from model.XLNet_classifier import XLNetClassifier
-from transformer import model
-
-parser = argparse.ArgumentParser(
- description='XLNet fine-tune examples for classification/regression tasks.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
-# Training config
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs.')
-parser.add_argument('--training_steps',
- type=int,
- help='If specified, epochs will be ignored.')
-parser.add_argument(
- '--batch_size',
- type=int,
- default=128,
- help='Batch size. Number of examples per gpu in a minibatch.')
-
-parser.add_argument(
- '--accumulate',
- type=int,
- default=None,
- help=
- 'The number of batches for gradient accumulation to simulate a large batch size. '
- 'Default is None')
-
-parser.add_argument('--dev_batch_size',
- type=int,
- default=32,
- help='Batch size for dev set and test set')
-
-parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
-parser.add_argument('--attention_dropout',
- type=float,
- default=0.1,
- help='attention dropout')
-parser.add_argument('--log_interval',
- type=int,
- default=10,
- help='report interval')
-parser.add_argument(
- '--early_stop',
- type=int,
- default=None,
- help='Whether to perform early stopping based on the metric on dev set. '
- 'The provided value is the patience. ')
-
-# Optimizer config
-parser.add_argument('--optimizer', type=str, default='Adam', help='')
-parser.add_argument('--lr',
- type=float,
- default=3e-5,
- help='Initial learning rate')
-parser.add_argument('--lr_decay',
- type=str,
- choices=['linear'],
- default='linear',
- help='lr schedule')
-parser.add_argument('--epsilon',
- type=float,
- default=1e-6,
- help='Small value to avoid division by 0')
-parser.add_argument(
- '--warmup_ratio',
- type=float,
- default=0,
- help='ratio of warmup steps that linearly increase the learning rate from 0 to its target value')
-
-# task specific & data preprocessing
-parser.add_argument('--gpu',
- type=int,
- default=None,
- help='Number of gpus for finetuning.')
-parser.add_argument('--task_name',
- default='MRPC',
- choices=['MRPC', 'QNLI', 'RTE', 'STS-B', 'CoLA',
- 'MNLI', 'WNLI', 'SST', 'XNLI', 'LCQMC', 'ChnSentiCorp'],
- type=str,
- help='The name of the task to fine-tune.')
-
-parser.add_argument(
- '--model_name',
- type=str,
- default='xlnet_cased_l12_h768_a12',
- choices=['xlnet_cased_l24_h1024_a16', 'xlnet_cased_l12_h768_a12'],
- help='The name of pre-trained XLNet model to fine-tune')
-
-parser.add_argument('--dataset',
- type=str,
- default='126gb',
- help='The dataset the XLNet model was pre-trained on.')
-parser.add_argument('--max_len',
- type=int,
- default=128,
- help='Maximum length of the sentence pairs')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-
-parser.add_argument(
- '--only_inference',
- action='store_true',
- help=
- 'If set, we skip training and only perform inference on dev and test data.'
-)
-
-# Initializing config
-parser.add_argument('--seed', type=int, default=2, help='Random seed')
-
-# I/O config
-parser.add_argument(
- '--output_dir',
- type=str,
- default='./output_dir',
- help='The output directory where the model params will be written.')
-parser.add_argument(
- '--model_parameters',
- type=str,
- default=None,
- help='A parameter file for the model that is loaded into the model'
- ' before training/inference. It is different from the parameter'
- ' file written after the model is trained.')
-
-args = parser.parse_args()
-
-
-def split_array(arr, num_of_splits):
- """split an array into equal pieces"""
- # TODO Replace this function with gluon.utils.split_data() once targeting MXNet 1.7
- size = arr.shape[0]
- if size < num_of_splits:
- return [arr[i:i + 1] for i in range(size)]
- slice_len, rest = divmod(size, num_of_splits)
- div_points = [0] + [(slice_len * index + min(index, rest) + slice_len +
- (index < rest)) for index in range(num_of_splits)]
- slices = [
- arr[div_points[i]:div_points[i + 1]] for i in range(num_of_splits)
- ]
- return slices
-
-
-def split_and_load(arrs, _ctxs):
- """split and load arrays to a list of contexts"""
- # TODO Replace split_array() with gluon.utils.split_data() once targeting MXNet 1.7
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [[
- i.as_in_context(ctx)
- for i, ctx in zip(split_array(arr, len(_ctxs)), _ctxs)
- ] for arr in arrs]
- return zip(*loaded_arrs)
-
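For reference, `split_array` above splits along the batch axis as evenly as possible, handing the first `size % num_of_splits` slices one extra row each. A quick usage sketch with made-up shapes, relying on the `split_array` defined above:

import numpy as np

batch = np.zeros((10, 128))                  # e.g. (batch_size, seq_length)
pieces = split_array(batch, num_of_splits=3)
print([p.shape[0] for p in pieces])          # -> [4, 3, 3]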
-
-def convert_examples_to_features(example,
- tokenizer=None,
- truncate_length=512,
- cls_token=None,
- sep_token=None,
- class_labels=None,
- label_alias=None,
- vocab=None,
- is_test=False):
- #pylint: disable=redefined-outer-name
- """convert glue examples into necessary features"""
- assert vocab
- if not is_test:
- label_dtype = 'int32' if class_labels else 'float32'
- # get the label
- label = example[-1]
- example = example[:-1]
- #create label maps if classification task
- if class_labels:
- label_map = {}
- for (i, l) in enumerate(class_labels):
- label_map[l] = i
- if label_alias:
- for key in label_alias:
- label_map[key] = label_map[label_alias[key]]
- label = label_map[label]
- label = np.array([label], dtype=label_dtype)
-
- # tokenize raw text
- tokens_raw = [tokenizer(l) for l in example]
- # truncate to the truncate_length,
- tokens_trun = truncate_seqs_equal(tokens_raw, truncate_length)
- # concatenate the sequences with special tokens; cls_token is added to the end in XLNet
- special_tokens = [[sep_token]] * len(tokens_trun) + [[cls_token]]
- tokens, segment_ids, _ = concat_sequences(tokens_trun, special_tokens)
- # convert the token to ids
- input_ids = vocab[tokens]
- valid_length = len(input_ids)
- if not is_test:
- return input_ids, valid_length, segment_ids, label
- else:
- return input_ids, valid_length, segment_ids
-
-
-def preprocess_data(_tokenizer,
- _task,
- batch_size,
- dev_batch_size,
- max_len,
- _vocab):
- """Train/eval Data preparation function."""
- label_dtype = 'int32' if _task.class_labels else 'float32'
- truncate_length = max_len - 3 if _task.is_pair else max_len - 2
- trans = partial(convert_examples_to_features,
- tokenizer=_tokenizer,
- truncate_length=truncate_length,
- cls_token=_vocab.cls_token,
- sep_token=_vocab.sep_token,
- class_labels=_task.class_labels,
- label_alias=_task.label_alias,
- vocab=_vocab)
-
- # data train
- # task.dataset_train returns (segment_name, dataset)
- train_tsv = _task.dataset_train()[1]
- data_train = list(map(trans, train_tsv))
- data_train = mx.gluon.data.SimpleDataset(data_train)
- data_train_len = data_train.transform(
- lambda _, valid_length, segment_ids, label: valid_length, lazy=False)
-
- # bucket sampler for training
- pad_val = _vocab[_vocab.padding_token]
- batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to), # input
- nlp.data.batchify.Stack(), # length
- nlp.data.batchify.Pad(axis=0, pad_val=4, round_to=args.round_to), # segment
- nlp.data.batchify.Stack(label_dtype)) # label
- batch_sampler = nlp.data.sampler.FixedBucketSampler(data_train_len,
- batch_size=batch_size,
- num_buckets=10,
- ratio=0,
- shuffle=True)
- # data loader for training
- loader_train = gluon.data.DataLoader(dataset=data_train,
- num_workers=4,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
-
- # data dev. For MNLI, more than one dev set is available
- dev_tsv = _task.dataset_dev()
- dev_tsv_list = dev_tsv if isinstance(dev_tsv, list) else [dev_tsv]
- loader_dev_list = []
- for segment, data in dev_tsv_list:
- data_dev = mx.gluon.data.SimpleDataset(list(map(trans, data)))
- loader_dev = mx.gluon.data.DataLoader(data_dev,
- batch_size=dev_batch_size,
- num_workers=4,
- shuffle=False,
- batchify_fn=batchify_fn)
- loader_dev_list.append((segment, loader_dev))
-
- # batchify for data test
- test_batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Pad(axis=0, pad_val=pad_val, round_to=args.round_to),
- nlp.data.batchify.Stack(),
- nlp.data.batchify.Pad(axis=0, pad_val=0, round_to=args.round_to))
-
- # transform for data test
- test_trans = partial(convert_examples_to_features,
- tokenizer=_tokenizer,
- truncate_length=max_len,
- cls_token=_vocab.cls_token,
- sep_token=_vocab.sep_token,
- class_labels=None,
- is_test=True,
- vocab=_vocab)
-
- # data test. For MNLI, more than one test set is available
- test_tsv = _task.dataset_test()
- test_tsv_list = test_tsv if isinstance(test_tsv, list) else [test_tsv]
- loader_test_list = []
- for segment, data in test_tsv_list:
- data_test = mx.gluon.data.SimpleDataset(list(map(test_trans, data)))
- loader_test = mx.gluon.data.DataLoader(data_test,
- batch_size=dev_batch_size,
- num_workers=4,
- shuffle=False,
- batchify_fn=test_batchify_fn)
- loader_test_list.append((segment, loader_test))
- return loader_train, loader_dev_list, loader_test_list, len(data_train)
-
-
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-logging.captureWarnings(True)
-handler = logging.FileHandler('log_{0}.txt'.format(args.task_name))
-handler.setLevel(logging.INFO)
-handler2 = logging.StreamHandler()
-handler2.setLevel(logging.INFO)
-formatter = logging.Formatter(
- '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
-handler.setFormatter(formatter)
-handler2.setFormatter(formatter)
-logger.addHandler(handler)
-logger.addHandler(handler2)
-logging.info(args)
-
-log_interval = args.log_interval * args.accumulate if args.accumulate else args.log_interval
-
-if args.accumulate:
- logging.info('Using gradient accumulation. Effective batch size = ' \
- 'batch_size * accumulate = %d', args.accumulate * args.batch_size)
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-num_workers = 0
-ctxs = [mx.cpu(0)] if not args.gpu else [mx.gpu(i) for i in range(args.gpu)]
-
-task = get_task(args.task_name)
-
-# model and loss
-if args.only_inference and not args.model_parameters:
- warnings.warn('model_parameters is not set. '
- 'Randomly initialized model will be used for inference.')
-
-get_pretrained = True
-
-get_model_params = {
- 'name': args.model_name,
- 'dataset_name': args.dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctxs,
- 'use_decoder': False,
- 'dropout': args.dropout,
- 'attention_dropout': args.attention_dropout
-}
-
-xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
-# initialize the rest of the parameters
-initializer = mx.init.Normal(0.02)
-
-do_regression = not task.class_labels
-if do_regression:
- num_classes = 1
- loss_function = gluon.loss.L2Loss()
-else:
- num_classes = len(task.class_labels)
- loss_function = gluon.loss.SoftmaxCELoss()
-# reuse the XLNetClassifier class with num_classes=1 for regression
-model = XLNetClassifier(xlnet_base,
- units=xlnet_base._net._units,
- dropout=0.1,
- num_classes=num_classes)
-
-num_ctxes = len(ctxs)
-
-# initialize classifier
-if not args.model_parameters:
- model.classifier.initialize(init=initializer, ctx=ctxs)
- model.pooler.initialize(init=initializer, ctx=ctxs)
-
-# load checkpointing
-output_dir = args.output_dir
-
-if args.model_parameters:
- logging.info('loading model params from %s', args.model_parameters)
- nlp.utils.load_parameters(model,
- args.model_parameters,
- ctx=ctxs,
- cast_dtype=True)
-
-nlp.utils.mkdir(output_dir)
-
-logging.debug(model)
-model.hybridize(static_alloc=True)
-loss_function.hybridize(static_alloc=True)
-
-logging.info('processing dataset...')
-train_data, dev_data_list, test_data_list, num_train_examples = preprocess_data(
- tokenizer, task, args.batch_size, args.dev_batch_size, args.max_len, vocab)
-
-
-def test(loader_test, segment):
- """Inference function on the test dataset."""
- logging.info('Now we are doing testing on %s with %s.', segment, ctxs)
-
- tic = time.time()
- results = []
- for _, seqs in enumerate(loader_test):
- #input_ids, valid_length, segment_ids = seqs
- data_list = list(split_and_load(seqs, ctxs))
- out_list = []
- for splited_data in data_list:
- input_ids, valid_length, segment_ids = splited_data
- out = model(input_ids, segment_ids, valid_length=valid_length)
- out_list.append(out)
- out_list = np.vstack([o.asnumpy() for o in out_list])
- if not task.class_labels:
- # regression task
- for result in out_list.reshape(-1).tolist():
- results.append('{:.3f}'.format(result))
- else:
- # classification task
- out = out_list.reshape(-1, out_list.shape[-1])
- indices = out.argmax(axis=-1)
- for index in indices:
- results.append(task.class_labels[int(index)])
-
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- args.dev_batch_size * len(loader_test) / (toc - tic))
- # write result to a file.
- segment = segment.replace('_mismatched', '-mm')
- segment = segment.replace('_matched', '-m')
- segment = segment.replace('SST', 'SST-2')
- filename = args.task_name + segment.replace('test', '') + '.tsv'
- test_path = os.path.join(args.output_dir, filename)
- with io.open(test_path, 'w', encoding='utf-8') as f:
- f.write(u'index\tprediction\n')
- for i, pred in enumerate(results):
- f.write(u'%d\t%s\n' % (i, str(pred)))
-
-
-def log_metric(metric, is_training=True):
- prefix = 'training' if is_training else 'validation'
- metric_nm, metric_val = metric.get()
- if not isinstance(metric_nm, list):
- metric_nm, metric_val = [metric_nm], [metric_val]
- logging_str = prefix + ' metrics:' + ','.join(
- [i + ':%.4f' for i in metric_nm])
- logging.info(logging_str, *metric_val)
- return metric_nm, metric_val
-
-
-def log_train(batch_id, batch_num, step_loss, _log_interval, epoch_id,
- learning_rate):
- """Generate and print out the log message for training. """
- train_str = '[Epoch %d Batch %d/%d] loss=%.4f, lr=%.7f'
- logging.info(train_str, epoch_id + 1, batch_id + 1, batch_num,
- step_loss / _log_interval, learning_rate)
-
-
-def log_eval(batch_id, batch_num, step_loss, _log_interval):
- """Generate and print out the log message for inference. """
- eval_str = '[Batch %d/%d] loss=%.4f'
- logging.info(eval_str, batch_id + 1, batch_num, step_loss / _log_interval)
-
-
-def train(metric):
- """Training function."""
- if not args.only_inference:
- logging.info('Now we are doing XLNet classification training on %s!',
- ctxs)
-
- all_model_params = model.collect_params()
- optimizer_params = {
- 'learning_rate': args.lr,
- 'epsilon': args.epsilon,
- 'wd': 0
- }
- trainer = gluon.Trainer(all_model_params,
- args.optimizer,
- optimizer_params,
- update_on_kvstore=False)
-
- step_size = args.batch_size * args.accumulate if args.accumulate else args.batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- epoch_number = args.epochs
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 9999
- logging.info('training steps=%d', num_train_steps)
- warmup_ratio = args.warmup_ratio
- num_warmup_steps = int(num_train_steps * warmup_ratio)
- step_num = 0
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in all_model_params.values() if p.grad_req != 'null']
-
- # Set grad_req if gradient accumulation is required
- if args.accumulate and args.accumulate > 1:
- for p in params:
- p.grad_req = 'add'
- # track best eval score
- metric_history = []
- best_metric = None
- patience = args.early_stop
-
- tic = time.time()
- finish_flag = False
- for epoch_id in range(epoch_number):
- if args.early_stop and patience == 0:
- logging.info('Early stopping at epoch %d', epoch_id)
- break
- if finish_flag:
- break
- if not args.only_inference:
- metric.reset()
- step_loss = 0
- tic = time.time()
- all_model_params.zero_grad()
- for batch_id, seqs in enumerate(train_data):
- new_lr = args.lr
- # learning rate schedule
- if step_num < num_warmup_steps:
- new_lr = args.lr * step_num / num_warmup_steps
- elif args.lr_decay == 'linear':
- non_warmup_steps = step_num - num_warmup_steps
- offset = non_warmup_steps / (num_train_steps -
- num_warmup_steps)
- new_lr = max(0, args.lr - offset * args.lr)
- trainer.set_learning_rate(new_lr)
- batch_loss = []
- # forward and backward
- with mx.autograd.record():
- data_list = list(split_and_load(seqs, ctxs))
- for splited_data in data_list:
- input_ids, valid_length, segment_ids, label = splited_data
- out = model(input_ids,
- segment_ids,
- valid_length=valid_length)
- ls = loss_function(out, label).mean() / len(ctxs)
- batch_loss.append(ls)
- if args.accumulate:
- ls = ls / args.accumulate
- ls.backward()
- # update
- if not args.accumulate or (batch_id +
- 1) % args.accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.update(args.accumulate if args.accumulate else 1,
- ignore_stale_grad=True)
- step_num += 1
- if args.accumulate and args.accumulate > 1:
- # set grad to zero for gradient accumulation
- all_model_params.zero_grad()
- if batch_id == 0 and epoch_id == 0:
- toc = time.time()
- logging.info(
- 'Time cost for the first forward-backward =%.2fs',
- toc - tic)
- batch_loss = sum([ls.asscalar() for ls in batch_loss])
- step_loss += batch_loss
- if (batch_id + 1) % (args.log_interval) == 0:
- log_train(batch_id, len(train_data), step_loss,
- args.log_interval, epoch_id,
- trainer.learning_rate)
- step_loss = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
-
- mx.nd.waitall()
-
- # inference on dev data
- for segment, dev_data in dev_data_list:
- metric_nm, metric_val = evaluate(dev_data, metric, segment)
- if best_metric is None or metric_val >= best_metric:
- best_metric = metric_val
- patience = args.early_stop
- else:
- if args.early_stop is not None:
- patience -= 1
- metric_history.append((epoch_id, metric_nm, metric_val))
-
- if not args.only_inference:
- # save params
- ckpt_name = 'model_xlnet_{0}_{1}.params'.format(
- args.task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.save_parameters(model, params_saved)
- logging.info('params saved in: %s', params_saved)
- toc = time.time()
- logging.info('Time cost=%.2fs', toc - tic)
- tic = toc
-
- if not args.only_inference:
- # we choose the best model based on metric[0],
- # assuming higher score stands for better model quality
- metric_history.sort(key=lambda x: x[2][0], reverse=True)
- epoch_id, metric_nm, metric_val = metric_history[0]
- ckpt_name = 'model_xlnet_{0}_{1}.params'.format(
- args.task_name, epoch_id)
- params_saved = os.path.join(output_dir, ckpt_name)
- nlp.utils.load_parameters(model, params_saved)
- metric_str = 'Best model at epoch {}. Validation metrics:'.format(
- epoch_id + 1)
- metric_str += ','.join([i + ':%.4f' for i in metric_nm])
- logging.info(metric_str, *metric_val)
-
- # inference on test data
- for segment, test_data in test_data_list:
- test(test_data, segment)
- print('finish test!')
-
-
-def evaluate(loader_dev, metric, segment):
- """Evaluate the model on validation dataset."""
- logging.info('Now we are doing evaluation on %s with %s.', segment, ctxs)
- metric.reset()
- step_loss = 0
- tic = time.time()
- out_list = []
- label_list = []
- for batch_id, seqs in enumerate(loader_dev):
- batch_loss = []
- # forward and backward
- data_list = list(split_and_load(seqs, ctxs))
- for splited_data in data_list:
- input_ids, valid_length, segment_ids, label = splited_data
- out = model(input_ids, segment_ids, valid_length=valid_length)
- batch_loss.append(loss_function(out, label).mean() / len(ctxs))
- if not do_regression:
- label = label.reshape((-1))
- out_list.append(out.as_in_context(mx.cpu(0)))
- label_list.append(label.as_in_context(mx.cpu(0)))
-
- batch_loss = sum([ls.asscalar() for ls in batch_loss])
- step_loss += batch_loss
- if (batch_id + 1) % (args.log_interval) == 0:
- log_eval(batch_id, len(loader_dev), step_loss, args.log_interval)
- step_loss = 0
-
- label_list = mx.nd.concat(*label_list, dim=0)
- out_list = mx.nd.concat(*out_list, dim=0)
- metric.update([label_list], [out_list])
- metric_nm, metric_val = log_metric(metric, is_training=False)
- mx.nd.waitall()
- toc = time.time()
- logging.info('Time cost=%.2fs, throughput=%.2f samples/s', toc - tic,
- args.dev_batch_size * len(loader_dev) / (toc - tic))
- return metric_nm, metric_val
-
-
-if __name__ == '__main__':
- train(task.metrics)
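The learning-rate schedule embedded in `train()` above (linear warmup for `warmup_ratio * num_train_steps` steps, then linear decay to zero, assuming `--lr_decay linear`) can be read as a standalone function; a minimal sketch with hypothetical names:

def warmup_linear_decay(step_num, base_lr, num_train_steps, warmup_ratio):
    # Linear warmup followed by linear decay, mirroring the logic in train().
    # step_num counts completed optimizer updates, as in the script above.
    num_warmup_steps = int(num_train_steps * warmup_ratio)
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) / (num_train_steps - num_warmup_steps)
    return max(0.0, base_lr - offset * base_lr)

# e.g. lr halfway through training with 10% warmup
lr = warmup_linear_decay(step_num=500, base_lr=3e-5, num_train_steps=1000, warmup_ratio=0.1)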
diff --git a/scripts/language_model/run_squad.py b/scripts/language_model/run_squad.py
deleted file mode 100644
index ab57edf7c4..0000000000
--- a/scripts/language_model/run_squad.py
+++ /dev/null
@@ -1,721 +0,0 @@
-"""
-Question Answering with XLNet
-"""
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import os
-import time
-import argparse
-import random
-import logging
-import warnings
-import json
-import collections
-import pickle
-import sys
-import itertools
-import subprocess
-import multiprocessing as mp
-from functools import partial
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-from gluonnlp.data import SQuAD
-from gluonnlp.data.bert.glue import concat_sequences
-from gluonnlp.data.bert.squad import get_doc_spans, \
- check_is_max_context, convert_squad_examples, align_position2doc_spans
-from gluonnlp.data.xlnet.squad import lcs_match, convert_index
-from model.qa import XLNetForQA
-from transformer import model
-from xlnet_qa_evaluate import predict_extended
-parser = argparse.ArgumentParser(description='XLNet QA example. '
- 'We fine-tune the XLNet model on the SQuAD dataset.')
-
-# I/O configuration
-parser.add_argument('--sentencepiece', type=str, default=None,
- help='Path to the sentencepiece .model file for both tokenization and vocab.')
-parser.add_argument('--pretrained_xlnet_parameters', type=str, default=None,
- help='Pre-trained XLNet model parameter file. default is None')
-parser.add_argument('--load_pickle', action='store_true',
- help='Whether to do data preprocessing or to load features from a pickled file')
-parser.add_argument('--dev_dataset_file', default='./output_dir/out.dev', type=str,
- help='Path to dev data features')
-parser.add_argument('--train_dataset_file', default='./output_dir/out.train', type=str,
- help='Path to train data features')
-parser.add_argument('--model_parameters', type=str, default=None, help='Model parameter file')
-parser.add_argument(
- '--output_dir', type=str, default='./output_dir',
- help='The output directory where the model params will be written.'
- ' default is ./output_dir')
-
-# Training configuration
-parser.add_argument('--seed', type=int, default=3, help='Random seed')
-parser.add_argument('--version_2', action='store_true', help='Whether use SQuAD v2.0 dataset')
-parser.add_argument('--model', type=str, default='xlnet_cased_l12_h768_a12',
- choices=['xlnet_cased_l24_h1024_a16', 'xlnet_cased_l12_h768_a12'],
- help='The name of pre-trained XLNet model to fine-tune')
-parser.add_argument('--dataset', type=str, default='126gb', choices=['126gb'],
- help='The dataset the XLNet model was pre-trained on. Currently only 126gb is available')
-parser.add_argument(
- '--uncased', action='store_true', help=
- 'if set, inputs are converted to lower case. Up to 01/04/2020, all released models are cased')
-parser.add_argument('--gpu', type=int, default=None,
- help='Number of gpus to use for finetuning. CPU is used if not set.')
-parser.add_argument('--log_interval', type=int, default=10, help='report interval. default is 10')
-parser.add_argument('--debug', action='store_true',
- help='Run the example in test mode for sanity checks')
-parser.add_argument('--only_predict', action='store_true', help='Whether to predict only.')
-
-# Hyperparameters
-parser.add_argument('--epochs', type=int, default=3, help='number of epochs, default is 3')
-parser.add_argument(
- '--training_steps', type=int, help='training steps. Note that epochs will be ignored '
- 'if training steps are set')
-
-parser.add_argument('--batch_size', type=int, default=32,
- help='Batch size. Number of examples per gpu in a minibatch. default is 32')
-
-parser.add_argument('--test_batch_size', type=int, default=24,
- help='Test batch size. default is 24')
-
-parser.add_argument('--optimizer', type=str, default='bertadam',
- help='optimization algorithm. default is bertadam')
-
-parser.add_argument(
- '--accumulate', type=int, default=None, help='The number of batches for '
- 'gradient accumulation to simulate a large batch size. Default is None')
-
-parser.add_argument('--lr', type=float, default=3e-5,
- help='Initial learning rate. default is 3e-5')
-
-parser.add_argument(
- '--warmup_ratio', type=float, default=0,
- help='ratio of warmup steps that linearly increase learning rate from '
- '0 to target learning rate. default is 0')
-parser.add_argument('--layerwise_decay', type=float, default=0.75, help='Layer-wise lr decay')
-parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
-parser.add_argument('--dropout', type=float, default=0.1, help='dropout')
-parser.add_argument('--attention_dropout', type=float, default=0.1, help='attention dropout')
-
-# Data pre/post processing
-parser.add_argument(
- '--max_seq_length', type=int, default=512,
- help='The maximum total input sequence length after SentencePiece tokenization. '
- 'Sequences longer than this will be truncated, and sequences shorter '
- 'than this will be padded. default is 512')
-
-parser.add_argument(
- '--doc_stride', type=int, default=128,
- help='When splitting up a long document into chunks, how much stride to '
- 'take between chunks. default is 128')
-
-parser.add_argument(
- '--max_query_length', type=int, default=64,
- help='The maximum number of tokens for the question. Questions longer than '
- 'this will be truncated to this length. default is 64')
-
-parser.add_argument(
- '--round_to', type=int, default=None,
- help='The length of padded sequences will be rounded up to be a multiple of this argument. '
- 'When round_to is set to 8, training throughput may increase for mixed precision '
- 'training on GPUs with tensor cores.')
-
-parser.add_argument('--start_top_n', type=int, default=5,
- help='Number of start-position candidates')
-parser.add_argument('--end_top_n', type=int, default=5,
- help='Number of end-position candidates corresponding '
- 'to a start position')
-parser.add_argument('--n_best_size', type=int, default=5, help='top N results written to file')
-parser.add_argument(
- '--max_answer_length', type=int, default=64,
- help='The maximum length of an answer that can be generated. This is needed '
- 'because the start and end predictions are not conditioned on one another.'
- ' default is 64')
-parser.add_argument('--num_workers', type=int, default=4,
- help='Number of workers used for data preprocessing')
-parser.add_argument(
- '--null_score_diff_threshold', type=float, default=0.0,
- help='If null_score - best_non_null is greater than the threshold predict null. '
- 'Typical values are between -1.0 and -5.0. default is 0.0. '
- 'Note that a best value can be automatically found by the evaluation script')
-
-args = parser.parse_args()
-
-# random seed
-np.random.seed(args.seed)
-random.seed(args.seed)
-mx.random.seed(args.seed)
-
-if not os.path.exists(args.output_dir):
- os.mkdir(args.output_dir)
-
-# set the logger
-log = logging.getLogger('gluonnlp')
-log.setLevel(logging.DEBUG)
-formatter = logging.Formatter(fmt='%(levelname)s:%(name)s:%(asctime)s %(message)s',
- datefmt='%H:%M:%S')
-fh = logging.FileHandler(os.path.join(args.output_dir, 'finetune_squad.log'))
-fh.setLevel(logging.INFO)
-fh.setFormatter(formatter)
-console = logging.StreamHandler()
-console.setLevel(logging.INFO)
-console.setFormatter(formatter)
-log.addHandler(console)
-log.addHandler(fh)
-
-log.info(args)
-
-pretrained_xlnet_parameters = args.pretrained_xlnet_parameters
-if pretrained_xlnet_parameters and args.model_parameters:
- raise ValueError('Cannot provide both pre-trained XLNet parameters and '
- 'XLNetForQA model parameters.')
-
-ctx = [mx.cpu(0)] if not args.gpu else [mx.gpu(i) for i in range(args.gpu)]
-
-log_interval = args.log_interval * args.accumulate if args.accumulate else args.log_interval
-if args.accumulate:
- log.info('Using gradient accumulation. Effective batch size = %d',
- args.accumulate * args.batch_size)
-if args.max_seq_length <= args.max_query_length + 3:
- raise ValueError('The max_seq_length (%d) must be greater than max_query_length '
- '(%d) + 3' % (args.max_seq_length, args.max_query_length))
-
-get_pretrained = True
-
-get_model_params = {
- 'name': args.model,
- 'dataset_name': args.dataset,
- 'pretrained': get_pretrained,
- 'ctx': ctx,
- 'use_decoder': False,
- 'dropout': args.dropout,
- 'attention_dropout': args.attention_dropout
-}
-
-# model, vocabulary and tokenizer
-xlnet_base, vocab, tokenizer = model.get_model(**get_model_params)
-
-batchify_fn = nlp.data.batchify.Tuple(
- nlp.data.batchify.Stack('int32'), # example_id
- nlp.data.batchify.Pad(axis=0, pad_val=vocab[vocab.padding_token], dtype='int32',
- round_to=args.round_to), # input_ids
- nlp.data.batchify.Pad(axis=0, pad_val=3, dtype='int32', round_to=args.round_to), # segment_ids
- nlp.data.batchify.Stack('float32'), # valid_length
- nlp.data.batchify.Pad(axis=0, pad_val=1, round_to=args.round_to), # p_mask
- nlp.data.batchify.Stack('float32'), # start_position
- nlp.data.batchify.Stack('float32'), # end_position
- nlp.data.batchify.Stack('float32')) # is_impossible
-
-if pretrained_xlnet_parameters:
- # only load XLnetModel parameters
- nlp.utils.load_parameters(xlnet_base, pretrained_xlnet_parameters, ctx=ctx, ignore_extra=True,
- cast_dtype=True)
-
-units = xlnet_base._net._units
-net = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n, end_top_n=args.end_top_n,
- units=units)
-
-net_eval = XLNetForQA(xlnet_base=xlnet_base, start_top_n=args.start_top_n,
- end_top_n=args.end_top_n, units=units, is_eval=True,
- params=net.collect_params())
-
-initializer = mx.init.Normal(0.02)
-
-if args.model_parameters:
- # load complete XLNetForQA parameters
- nlp.utils.load_parameters(net, args.model_parameters, ctx=ctx, cast_dtype=True)
-else:
- net.start_logits.initialize(init=initializer, ctx=ctx)
- net.end_logits.initialize(init=initializer, ctx=ctx)
- net.answer_class.initialize(init=initializer, ctx=ctx)
-
-net.hybridize(static_alloc=True)
-net_eval.hybridize(static_alloc=True)
-
-SquadXLNetFeautre = collections.namedtuple('SquadXLNetFeautre', [
- 'example_id', 'qas_id', 'valid_length', 'tokens', 'tok_start_to_orig_index',
- 'tok_end_to_orig_index', 'token_is_max_context', 'input_ids', 'p_mask', 'segment_ids',
- 'start_position', 'end_position', 'paragraph_text', 'paragraph_len', 'is_impossible'
-])
-
-
-def convert_examples_to_features(example, tokenizer=None, cls_token=None, sep_token=None,
- vocab=None, max_seq_length=384, doc_stride=128,
- max_query_length=64, is_training=True):
- """convert the examples to the XLNet features"""
- query_tokenized = tokenizer(example.question_text)[:max_query_length]
- #tokenize paragraph and get start/end position of the answer in tokenized paragraph
- paragraph_tokenized = tokenizer(example.paragraph_text)
-
- chartok_to_tok_index = [] # char to its corresponding token's index
- tok_start_to_chartok_index = [] # token index to its first character's index
- tok_end_to_chartok_index = [] # token index to its last character's index
- char_cnt = 0
- for i, token in enumerate(paragraph_tokenized):
- chartok_to_tok_index.extend([i] * len(token))
- tok_start_to_chartok_index.append(char_cnt)
- char_cnt += len(token)
- tok_end_to_chartok_index.append(char_cnt - 1)
-
- tok_cat_text = ''.join(paragraph_tokenized).replace(u'▁', ' ')
-
- # XLNet takes a more involved strategy to match the original text
- # with the tokenized tokens:
- # get the LCS matching between the original text and the token-concatenated text.
- n, m = len(example.paragraph_text), len(tok_cat_text)
- max_dist = abs(n - m) + 5
- for _ in range(2):
- f, g = lcs_match(max_dist, example.paragraph_text, tok_cat_text)
- if f[n - 1, m - 1] > 0.8 * n:
- break
- max_dist *= 2
-
- # Get the mapping from original text/tokenized text to tokenized text/original text
- orig_to_chartok_index = [None] * n
- chartok_to_orig_index = [None] * m
- i, j = n - 1, m - 1
- while i >= 0 and j >= 0:
- if (i, j) not in g:
- break
- if g[(i, j)] == 2:
- orig_to_chartok_index[i] = j
- chartok_to_orig_index[j] = i
- i, j = i - 1, j - 1
- elif g[(i, j)] == 1:
- j = j - 1
- else:
- i = i - 1
-
- # get start/end mapping
- tok_start_to_orig_index = []
- tok_end_to_orig_index = []
- for i in range(len(paragraph_tokenized)): # for each token in the tokenized paragraph
- start_chartok_pos = tok_start_to_chartok_index[i] # first character's index in the token-concatenated text
- end_chartok_pos = tok_end_to_chartok_index[i] # last character's index in the token-concatenated text
- start_orig_pos = convert_index(chartok_to_orig_index, start_chartok_pos, n, is_start=True)
- end_orig_pos = convert_index(chartok_to_orig_index, end_chartok_pos, m, is_start=False)
-
- tok_start_to_orig_index.append(start_orig_pos)
- tok_end_to_orig_index.append(end_orig_pos)
-
- tok_start_position, tok_end_position = -1, -1
-
- # get mapped start/end position
- if is_training and not example.is_impossible:
- start_chartok_pos = convert_index(orig_to_chartok_index, example.start_offset,
- is_start=True)
- tok_start_position = chartok_to_tok_index[start_chartok_pos]
-
- end_chartok_pos = convert_index(orig_to_chartok_index, example.end_offset, is_start=False)
- tok_end_position = chartok_to_tok_index[end_chartok_pos]
- assert tok_start_position <= tok_end_position
-
- # get doc spans using sliding window
- doc_spans, doc_spans_indices = get_doc_spans(paragraph_tokenized,
- max_seq_length - len(query_tokenized) - 3,
- doc_stride)
-
- # record whether the tokens in a docspan have max context
- token_is_max_context = [{
- p: check_is_max_context(doc_spans_indices, i, p + doc_spans_indices[i][0])
- for p in range(len(doc_span))
- } for (i, doc_span) in enumerate(doc_spans)]
-
- # get token -> origin text mapping
- cur_tok_start_to_orig_index = [[tok_start_to_orig_index[p + st] for p in range(len(doc_span))]
- for doc_span, (st, ed) in zip(doc_spans, doc_spans_indices)]
- cur_tok_end_to_orig_index = [[tok_end_to_orig_index[p + st] for p in range(len(doc_span))]
- for doc_span, (st, ed) in zip(doc_spans, doc_spans_indices)]
-
- # get sequence features: tokens, segment_ids, p_masks
- seq_features = [
- concat_sequences([doc_span, query_tokenized], [[sep_token]] * 2 + [[cls_token]],
- [[0] * len(doc_span), [1] * len(query_tokenized)], [[1], [1], [0]])
- for doc_span in doc_spans
- ]
-
- # get the start/end positions aligned to doc spans. If is_impossible or position out of span
- # set position to cls_index, i.e., last token in the sequence.
- if not example.is_impossible:
- positions = [
- align_position2doc_spans([tok_start_position, tok_end_position], doc_idx, offset=0,
- default_value=len(seq[0]) - 1)
- for (doc_idx, seq) in zip(doc_spans_indices, seq_features)
- ]
- else:
- positions = [(len(seq_feature[0]) - 1, len(seq_feature[0]) - 1)
- for seq_feature in seq_features]
-
- features = [
- SquadXLNetFeautre(example_id=example.example_id, qas_id=example.qas_id,
- tok_start_to_orig_index=t2st, tok_end_to_orig_index=t2ed,
- valid_length=len(tokens), tokens=tokens, token_is_max_context=is_max,
- input_ids=vocab[tokens], p_mask=p_mask, segment_ids=segment_ids,
- start_position=start, end_position=end,
- paragraph_text=example.paragraph_text, paragraph_len=len(tokens),
- is_impossible=(start == len(tokens) - 1))
- for (tokens, segment_ids, p_mask), (
- start,
- end), is_max, t2st, t2ed in zip(seq_features, positions, token_is_max_context,
- cur_tok_start_to_orig_index, cur_tok_end_to_orig_index)
- ]
- return features
-
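The three index lists built at the top of `convert_examples_to_features` only record, for every character of the token-concatenated paragraph, which token it belongs to, and for every token, its first and last character offset. A toy walk-through of that bookkeeping, with a made-up SentencePiece-style tokenization:

# toy illustration of the chartok/tok index bookkeeping above
paragraph_tokenized = ['▁My', '▁dog', '▁barks']   # hypothetical tokenization
chartok_to_tok_index = []
tok_start_to_chartok_index = []
tok_end_to_chartok_index = []
char_cnt = 0
for i, token in enumerate(paragraph_tokenized):
    chartok_to_tok_index.extend([i] * len(token))
    tok_start_to_chartok_index.append(char_cnt)
    char_cnt += len(token)
    tok_end_to_chartok_index.append(char_cnt - 1)

# chartok_to_tok_index       == [0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2]
# tok_start_to_chartok_index == [0, 3, 7]
# tok_end_to_chartok_index   == [2, 6, 12]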
-
-def preprocess_dataset(tokenizer, dataset, vocab=None, max_seq_length=384, doc_stride=128,
- max_query_length=64, num_workers=16, load_from_pickle=False,
- feature_file=None, is_training=True):
- """Loads a dataset into features"""
- vocab = tokenizer.vocab if vocab is None else vocab
- trans = partial(convert_examples_to_features, tokenizer=tokenizer, cls_token=vocab.cls_token,
- sep_token=vocab.sep_token, vocab=vocab, max_seq_length=max_seq_length,
- doc_stride=doc_stride, max_query_length=max_query_length)
- pool = mp.Pool(num_workers)
- start = time.time()
- if not load_from_pickle:
- example_trans = partial(convert_squad_examples, is_training=is_training)
- # convert the raw dataset into raw features
- examples = pool.map(example_trans, dataset)
- raw_features = list(map(trans, examples)) #pool.map(trans, examples)
- if feature_file:
- with open(feature_file, 'wb') as file:
- pickle.dump(raw_features, file)
- else:
- assert feature_file, 'feature file should be provided.'
- with open(feature_file, 'rb') as file:
- raw_features = pickle.load(file)
-
- end = time.time()
- pool.close()
- log.info('Done! Transforming the dataset took %.2f seconds.', (end - start))
- return raw_features
-
-
-def convert_full_features_to_input_features(raw_features):
- """convert the full features into the input features"""
- data_features = mx.gluon.data.SimpleDataset(list(itertools.chain.from_iterable(raw_features)))
- data_features = data_features.transform(lambda *example: (
- example[0], # example_id
- example[7], # inputs_id
- example[9], # segment_ids
- example[2], # valid_length,
- example[8], # p_mask
- example[10], # start_position,
- example[11], # end_position
- example[14])) # is_impossible
- return data_features
-
-
-def split_array(arr, num_of_splits):
- """split an array into equal pieces"""
- # TODO Replace this function with gluon.utils.split_data() once targeting MXNet 1.7
- size = arr.shape[0]
- if size < num_of_splits:
- return [arr[i:i + 1] for i in range(size)]
- slice_len, rest = divmod(size, num_of_splits)
- div_points = [0] + [(slice_len * index + min(index, rest) + slice_len + (index < rest))
- for index in range(num_of_splits)]
- slices = [arr[div_points[i]:div_points[i + 1]] for i in range(num_of_splits)]
- return slices
-
-
-def split_and_load(arrs, _ctxs):
- """split and load arrays to a list of contexts"""
- # TODO Replace split_array() with gluon.utils.split_data() once targeting MXNet 1.7
- assert isinstance(arrs, (list, tuple))
- # split and load
- loaded_arrs = [[i.as_in_context(ctx) for i, ctx in zip(split_array(arr, len(_ctxs)), _ctxs)]
- for arr in arrs]
- return zip(*loaded_arrs)
-
-
-def _apply_gradient_decay():
- """apply layer-wise gradient decay.
-
- Note that the description in origin paper about layer-wise learning rate decay
- is inaccurate. According to their implementation, they are actually performing
- layer-wise gradient decay. Gradient decay and learning rate decay could be the
- same by using standard SGD, but different by using Adaptive optimizer(e.g., Adam).
- """
- parameter_not_included = ['seg_emb', 'query_key_bias', 'query_emb_bias', 'query_seg_bias']
- num_layers = len(xlnet_base._net.transformer_cells)
- for (i, layer_parameters) in enumerate(xlnet_base._net.transformer_cells):
- layer_params = layer_parameters.collect_params()
- for key, value in layer_params.items():
- skip = False
- for pn in parameter_not_included:
- if pn in key:
- skip = True
- if skip:
- continue
- if value.grad_req != 'null':
- for arr in value.list_grad():
- arr *= args.layerwise_decay**(num_layers - i - 1)
-
-
-def train():
- """Training function."""
- segment = 'train'
- log.info('Loading %s data...', segment)
- # Note that for XLNet, the authors always use squad2 dataset for training
- train_data = SQuAD(segment, version='2.0')
- if args.debug:
- sampled_data = [train_data[i] for i in range(100)]
- train_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in Train data: %s', len(train_data))
-
- train_data_features = preprocess_dataset(
- tokenizer, train_data, vocab=vocab, max_seq_length=args.max_seq_length,
- doc_stride=args.doc_stride, num_workers=args.num_workers,
- max_query_length=args.max_query_length, load_from_pickle=args.load_pickle,
- feature_file=args.train_dataset_file)
-
- train_data_input = convert_full_features_to_input_features(train_data_features)
- log.info('The number of examples after preprocessing: %s', len(train_data_input))
-
- train_dataloader = mx.gluon.data.DataLoader(train_data_input, batchify_fn=batchify_fn,
- batch_size=args.batch_size, num_workers=4,
- shuffle=True)
-
- optimizer_params = {'learning_rate': args.lr, 'wd': args.wd}
- try:
- trainer = mx.gluon.Trainer(net.collect_params(), args.optimizer, optimizer_params,
- update_on_kvstore=False)
- except ValueError as _:
- warnings.warn('AdamW optimizer is not found. Please consider upgrading to '
- 'mxnet>=1.5.0. The BERTAdam optimizer is used instead.')
- trainer = mx.gluon.Trainer(net.collect_params(), 'bertadam', optimizer_params,
- update_on_kvstore=False)
-
- num_train_examples = len(train_data_input)
- step_size = args.batch_size * args.accumulate if args.accumulate else args.batch_size
- num_train_steps = int(num_train_examples / step_size * args.epochs)
- epoch_number = args.epochs
- if args.training_steps:
- num_train_steps = args.training_steps
- epoch_number = 100000
-
- log.info('training steps=%d', num_train_steps)
- num_warmup_steps = int(num_train_steps * args.warmup_ratio)
- step_num = 0
-
- def set_new_lr(step_num, batch_id):
- """set new learning rate"""
- # set grad to zero for gradient accumulation
- if args.accumulate:
- if batch_id % args.accumulate == 0:
- net.collect_params().zero_grad()
- step_num += 1
- else:
- step_num += 1
- # learning rate schedule: linear warmup for the first num_warmup_steps steps,
- # then linear decay of the learning rate towards 0 at num_train_steps
- if step_num < num_warmup_steps:
- new_lr = args.lr * step_num / num_warmup_steps
- else:
- offset = (step_num - num_warmup_steps) * args.lr / \
- (num_train_steps - num_warmup_steps)
- new_lr = args.lr - offset
- trainer.set_learning_rate(new_lr)
- return step_num
-
- # Do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- # Collect differentiable parameters
- params = [p for p in net.collect_params().values() if p.grad_req != 'null']
- # Set grad_req if gradient accumulation is required
- if args.accumulate:
- for p in params:
- p.grad_req = 'add'
-
- epoch_tic = time.time()
- total_num = 0
- log_num = 0
- finish_flag = False
- for epoch_id in range(epoch_number):
- step_loss = 0.0
- step_loss_span = 0
- step_loss_cls = 0
- tic = time.time()
- if finish_flag:
- break
- for batch_id, data in enumerate(train_dataloader):
- # set new lr
- step_num = set_new_lr(step_num, batch_id)
- data_list = list(split_and_load(data, ctx))
- # forward and backward
- batch_loss = []
- batch_loss_sep = []
- with mx.autograd.record():
- for splited_data in data_list:
- _, inputs, token_types, valid_length, p_mask, start_label, end_label, is_impossible = splited_data # pylint: disable=line-too-long
- valid_length = valid_length.astype('float32')
- log_num += len(inputs)
- total_num += len(inputs)
- out_sep, out = net(
- inputs,
- token_types,
- valid_length,
- [start_label, end_label],
- p_mask=p_mask, # pylint: disable=line-too-long
- is_impossible=is_impossible)
- ls = out.mean() / len(ctx)
- batch_loss_sep.append(out_sep)
- batch_loss.append(ls)
- if args.accumulate:
- ls = ls / args.accumulate
- ls.backward()
- # update
- if not args.accumulate or (batch_id + 1) % args.accumulate == 0:
- trainer.allreduce_grads()
- nlp.utils.clip_grad_global_norm(params, 1)
- _apply_gradient_decay()
- trainer.update(1, ignore_stale_grad=True)
-
- step_loss_sep_tmp = np.array(
- [[span_ls.mean().asscalar(),
- cls_ls.mean().asscalar()] for span_ls, cls_ls in batch_loss_sep])
- step_loss_sep_tmp = list(np.sum(step_loss_sep_tmp, axis=0))
- step_loss_span += step_loss_sep_tmp[0] / len(ctx)
- step_loss_cls += step_loss_sep_tmp[1] / len(ctx)
-
- step_loss += sum([ls.asscalar() for ls in batch_loss])
- if (batch_id + 1) % log_interval == 0:
- toc = time.time()
- log.info(
- 'Epoch: %d, Batch: %d/%d, Loss=%.4f, lr=%.7f '
- 'Time cost=%.1f Throughput=%.2f samples/s', epoch_id + 1, batch_id + 1,
- len(train_dataloader), step_loss / log_interval, trainer.learning_rate,
- toc - tic, log_num / (toc - tic))
- log.info('span_loss: %.4f, cls_loss: %.4f', step_loss_span / log_interval,
- step_loss_cls / log_interval)
-
- tic = time.time()
- step_loss = 0.0
- step_loss_span = 0
- step_loss_cls = 0
- log_num = 0
- if step_num >= num_train_steps:
- logging.info('Finish training step: %d', step_num)
- finish_flag = True
- break
- epoch_toc = time.time()
- log.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
- total_num / (epoch_toc - epoch_tic))
- version_prefix = 'squad2' if args.version_2 else 'squad1'
- ckpt_name = 'model_{}_{}_{}.params'.format(args.model, version_prefix, epoch_id + 1)
- params_saved = os.path.join(args.output_dir, ckpt_name)
- nlp.utils.save_parameters(net, params_saved)
- log.info('params saved in: %s', params_saved)
-
-
-RawResultExtended = collections.namedtuple(
- 'RawResultExtended',
- ['start_top_log_probs', 'start_top_index', 'end_top_log_probs', 'end_top_index', 'cls_logits'])
-
-
-def evaluate():
- """Evaluate the model on validation dataset.
- """
- log.info('Loading dev data...')
- if args.version_2:
- dev_data = SQuAD('dev', version='2.0')
- else:
- dev_data = SQuAD('dev', version='1.1')
- (_, _), (data_file_name, _) \
- = dev_data._data_file[dev_data._version][dev_data._segment]
- dev_data_path = os.path.join(dev_data._root, data_file_name)
-
- if args.debug:
- sampled_data = [dev_data[0], dev_data[1], dev_data[2]]
- dev_data = mx.gluon.data.SimpleDataset(sampled_data)
- log.info('Number of records in dev data: %d', len(dev_data))
-
- dev_data_features = preprocess_dataset(
- tokenizer, dev_data, vocab=vocab, max_seq_length=args.max_seq_length,
- doc_stride=args.doc_stride, num_workers=args.num_workers,
- max_query_length=args.max_query_length, load_from_pickle=args.load_pickle,
- feature_file=args.dev_dataset_file)
-
- dev_data_input = convert_full_features_to_input_features(dev_data_features)
- log.info('The number of examples after preprocessing: %d', len(dev_data_input))
-
- dev_dataloader = mx.gluon.data.DataLoader(dev_data_input, batchify_fn=batchify_fn,
- num_workers=4, batch_size=args.test_batch_size,
- shuffle=False, last_batch='keep')
-
- log.info('start prediction')
-
- all_results = collections.defaultdict(list)
-
- epoch_tic = time.time()
- total_num = 0
- for (batch_id, data) in enumerate(dev_dataloader):
- data_list = list(split_and_load(data, ctx))
- for splited_data in data_list:
- example_ids, inputs, token_types, valid_length, p_mask, _, _, _ = splited_data
- total_num += len(inputs)
- outputs = net_eval(inputs, token_types, valid_length, p_mask=p_mask)
- example_ids = example_ids.asnumpy().tolist()
- for c, example_id in enumerate(example_ids):
- result = RawResultExtended(start_top_log_probs=outputs[0][c].asnumpy().tolist(),
- start_top_index=outputs[1][c].asnumpy().tolist(),
- end_top_log_probs=outputs[2][c].asnumpy().tolist(),
- end_top_index=outputs[3][c].asnumpy().tolist(),
- cls_logits=outputs[4][c].asnumpy().tolist())
- all_results[example_id].append(result)
- if batch_id % args.log_interval == 0:
- log.info('Batch: %d/%d', batch_id + 1, len(dev_dataloader))
-
- epoch_toc = time.time()
- log.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
- total_num / (epoch_toc - epoch_tic))
-
- log.info('Get prediction results...')
-
- all_predictions = collections.OrderedDict()
- all_nbest_json = collections.OrderedDict()
- scores_diff_json = collections.OrderedDict()
- for features in dev_data_features:
- results = all_results[features[0].example_id]
- example_qas_id = features[0].qas_id
- score_diff, best_non_null_entry, nbest_json = predict_extended(
- features=features, results=results, n_best_size=args.n_best_size,
- max_answer_length=args.max_answer_length, start_n_top=args.start_top_n,
- end_n_top=args.end_top_n)
- scores_diff_json[example_qas_id] = score_diff
- all_predictions[example_qas_id] = best_non_null_entry
- all_nbest_json[example_qas_id] = nbest_json
-
- output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
- output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json')
- output_null_log_odds_file = os.path.join(args.output_dir, 'null_odds.json')
-
- with open(output_prediction_file, 'w') as writer:
- writer.write(json.dumps(all_predictions, indent=4) + '\n')
- with open(output_nbest_file, 'w') as writer:
- writer.write(json.dumps(all_nbest_json, indent=4) + '\n')
- with open(output_null_log_odds_file, 'w') as writer:
- writer.write(json.dumps(scores_diff_json, indent=4) + '\n')
-
- if os.path.exists(sys.path[0] + '/evaluate-v2.0.py'):
- arguments = [
- dev_data_path, output_prediction_file, '--na-prob-thresh',
- str(args.null_score_diff_threshold)
- ]
- if args.version_2:
- arguments += ['--na-prob-file', output_null_log_odds_file]
- subprocess.call([sys.executable, sys.path[0] + '/evaluate-v2.0.py'] + arguments)
- else:
- log.info('Please download evaluate-v2.0.py to get evaluation results for SQuAD. '
- 'Check index.rst for details.')
-
-
-if __name__ == '__main__':
- if not args.only_predict:
- train()
- evaluate()
- else:
- evaluate()
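For reference, the learning-rate handling buried in `set_new_lr` above is nothing more than linear warmup followed by linear decay. The sketch below restates that schedule as a standalone function so it can be inspected in isolation; the helper name `linear_warmup_lr` and the numbers in the demo are ours, not part of the deleted script.

def linear_warmup_lr(step_num, base_lr, num_warmup_steps, num_train_steps):
    """Learning rate at a given global step: linear warmup, then linear decay to 0."""
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps
    offset = (step_num - num_warmup_steps) * base_lr / (num_train_steps - num_warmup_steps)
    return base_lr - offset

if __name__ == '__main__':
    # e.g. base_lr=3e-5 with 1000 warmup steps out of 8000 total (illustrative numbers only)
    for step in (0, 500, 1000, 4500, 8000):
        print(step, linear_warmup_lr(step, base_lr=3e-5,
                                     num_warmup_steps=1000, num_train_steps=8000))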
diff --git a/scripts/language_model/sampler.py b/scripts/language_model/sampler.py
deleted file mode 100644
index f841fba160..0000000000
--- a/scripts/language_model/sampler.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Log Uniform Candidate Sampler"""
-
-import math
-import numpy as np
-from mxnet import ndarray, gluon
-
-
-class LogUniformSampler(gluon.block.Block):
- """Draw random samples from an approximately log-uniform or Zipfian distribution.
-
- This operation randomly samples *num_sampled* candidates from the range of integers [0, range_max).
- The elements of sampled_candidates are drawn without replacement from the base distribution.
-
- The base distribution for this operator is an approximately log-uniform or Zipfian distribution:
-
- P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
-
- This sampler is useful when the true classes approximately follow such a distribution.
-
- This is the case, for example, when the classes represent words in a lexicon sorted in
- decreasing order of frequency. If your classes are not ordered by decreasing frequency, do not use this sampler.
-
- Additionally, it also returns the number of times each of the
- true classes and the sampled classes is expected to occur.
-
- As the candidates are drawn without replacement, the expected count for the sampled candidates
- and true classes are approximated. If the candidates are drawn with `num_tries` draws, we assume
- (falsely) that the number of tries to get a batch of batch_size distinct values is always
- `num_tries`, and the probability that the value is in a batch is 1 - (1-p)**num_tries.
-
- Parameters
- ----------
- num_sampled: int
- The number of classes to randomly sample.
- range_max: int
- The number of possible classes.
- dtype: str or np.dtype
- The dtype for outputs
- """
- def __init__(self, range_max, num_sampled, dtype=None, **kwargs):
- super(LogUniformSampler, self).__init__(**kwargs)
- self._num_sampled = num_sampled
- self._log_range = math.log(range_max + 1)
- self._dtype = np.float32 if dtype is None else dtype
- self._range_max = range_max
-
- def _prob_helper(self, num_tries, prob):
- return (num_tries.astype('float64') * (-prob).log1p()).expm1() * -1
-
- def forward(self, true_classes): # pylint: disable=arguments-differ
- """Draw samples from log uniform distribution and returns sampled candidates,
- expected count for true classes and sampled classes.
-
- Parameters
- ----------
- true_classes: NDArray
- The true classes.
-
- Returns
- -------
- samples: NDArray
- The sampled candidate classes.
- expected_count_sample: NDArray
- The expected count for sampled candidates.
- expected_count_true: NDArray
- The expected count for true classes in the same shape as `true_classes`.
- """
- num_sampled = self._num_sampled
- ctx = true_classes.context
- num_tries = 0
- log_range = math.log(self._range_max + 1)
-
- # sample candidates
- f = ndarray._internal._sample_unique_zipfian
- sampled_classes, num_tries = f(self._range_max, shape=(1, num_sampled))
- sampled_classes = sampled_classes.reshape((-1,))
- sampled_classes = sampled_classes.as_in_context(ctx)
- num_tries = num_tries.as_in_context(ctx)
-
- # expected count for true classes
- true_cls = true_classes.as_in_context(ctx).astype('float64')
- prob_true = ((true_cls + 2.0) / (true_cls + 1.0)).log() / log_range
- count_true = self._prob_helper(num_tries, prob_true)
- # expected count for sampled classes
- sampled_classes = ndarray.array(sampled_classes, ctx=ctx, dtype='int64')
- sampled_cls_fp64 = sampled_classes.astype('float64')
- prob_sampled = ((sampled_cls_fp64 + 2.0) / (sampled_cls_fp64 + 1.0)).log() / log_range
- count_sampled = self._prob_helper(num_tries, prob_sampled)
- # convert to dtype
- sampled_classes = sampled_classes.astype(self._dtype, copy=False)
- count_true = count_true.astype(self._dtype, copy=False)
- count_sampled = count_sampled.astype(self._dtype, copy=False)
- return sampled_classes, count_sampled, count_true
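The expected-count math in `LogUniformSampler` above is easy to check in isolation. The NumPy sketch below reproduces the Zipfian class probability and the 1 - (1 - p)**num_tries approximation from `_prob_helper`; it does not reproduce the internal `_sample_unique_zipfian` kernel, and the `np.random.choice` call is only a stand-in for illustration.

import numpy as np

def zipf_prob(classes, range_max):
    # P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)
    classes = np.asarray(classes, dtype=np.float64)
    return (np.log(classes + 2.0) - np.log(classes + 1.0)) / np.log(range_max + 1.0)

def expected_count(classes, range_max, num_tries):
    # Probability of appearing at least once in num_tries draws: 1 - (1 - p)**num_tries,
    # computed as -expm1(num_tries * log1p(-p)) for numerical stability, as in _prob_helper.
    p = zipf_prob(classes, range_max)
    return -np.expm1(num_tries * np.log1p(-p))

range_max, num_sampled = 1000, 64
probs = zipf_prob(np.arange(range_max), range_max)
# Stand-in for _sample_unique_zipfian: draw unique candidates under the same distribution.
sampled = np.random.choice(range_max, size=num_sampled, replace=False, p=probs / probs.sum())
print(expected_count(sampled, range_max, num_tries=num_sampled)[:5])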
diff --git a/scripts/language_model/transformer/__init__.py b/scripts/language_model/transformer/__init__.py
deleted file mode 100644
index f687b12e5b..0000000000
--- a/scripts/language_model/transformer/__init__.py
+++ /dev/null
@@ -1,27 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utilities for Language Models based on the Transformer architecture."""
-
-from .attention_cell import *
-from .data import *
-from .embedding import *
-from .model import *
-from .softmax import *
-from .transformer import *
-
-__all__ = attention_cell.__all__ + embedding.__all__ + softmax.__all__ + \
- transformer.__all__ + model.__all__ + data.__all__
diff --git a/scripts/language_model/transformer/attention_cell.py b/scripts/language_model/transformer/attention_cell.py
deleted file mode 100644
index d82ece2960..0000000000
--- a/scripts/language_model/transformer/attention_cell.py
+++ /dev/null
@@ -1,394 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = [
- 'PositionalEmbeddingMultiHeadAttentionCell',
- 'RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell'
-]
-
-import math
-
-import mxnet as mx
-
-from gluonnlp.model.attention_cell import _masked_softmax
-
-
-def _rel_shift(F, x):
- """Perform relative shift operation following Dai et al. (2019) Appendix B.
-
- Unlike Dai et al.'s implementation, the relative shift is performed on the
- last two dimensions of the ndarray x. Further, we follow Yang et al. (2019)
- in not performing zero-padding but expecting the input array to be one
- element longer along the to be shifted dimension. For example, for
- TransformerXL, the pos_seq should be `arange(start=klen, stop=-qlen,
- step=-1)` instead of `arange(start=klen - 1, stop=-qlen, step=-1)`.
-
- Assumes len(x.shape) == 3 (could be generalized once F.swapaxes supports
- negative indices)
-
- """
- x_ = x
- # Reshape to x.shape[:-2] + [x.shape[-1] + 1, x.shape[-2]]
- x_ = F.reshape_like(x_, F.swapaxes(x_, 1, 2))
- # Remove padded elements
- x_ = F.slice_axis(x_, axis=-2, begin=1, end=None)
- # Reshape back to original shape
- x = F.reshape_like(x_, F.swapaxes(x_, 1, 2))
- return x
-
-
-class PositionalEmbeddingMultiHeadAttentionCell(mx.gluon.HybridBlock):
- """Multi-head Attention Cell with positional embeddings.
-
- Parameters
- ----------
- d_head
- Number of projected units for respectively query, key, value and
- positional embeddings per attention head.
- num_heads
- Number of parallel attention heads
- dropout
- Dropout probability applied to the attention weights.
- scaled
- Whether to scale the attention score by 1/sqrt(d_head).
- weight_initializer : str or `Initializer` or None, default None
- Initializer of the weights.
- bias_initializer : str or `Initializer`, default 'zeros'
- Initializer of the bias.
- """
-
- def __init__(self, d_head: int, num_heads: int, dropout: float, scaled: bool,
- weight_initializer=None, bias_initializer='zeros', dtype='float32', prefix=None,
- params=None):
- super().__init__(prefix=prefix, params=params)
- self._d_head = d_head
- self._num_heads = num_heads
- self._dropout = dropout
- self._scaled = scaled
- self._dtype = dtype
- units = ['query', 'key', 'value', 'emb']
- with self.name_scope():
- for name in units:
- setattr(
- self, 'proj_{}'.format(name),
- mx.gluon.nn.Dense(units=d_head * num_heads, use_bias=False, flatten=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='{}_'.format(name)))
- self.query_key_bias = self.params.get('query_key_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.query_emb_bias = self.params.get('query_emb_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- if dropout:
- self._dropout_layer = mx.gluon.nn.Dropout(dropout)
-
- def hybrid_forward(self, F, query, key, value, emb, mask, query_key_bias, query_emb_bias): # pylint: disable=arguments-differ
- """Compute the attention.
-
- Parameters
- ----------
- query : Symbol or NDArray
- Query vector. Shape (batch_size, query_length, query_dim)
- key : Symbol or NDArray
- Key of the memory. Shape (batch_size, memory_length, key_dim)
- value : Symbol or NDArray
- Value of the memory. Shape (batch_size, memory_length, value_dim)
- emb : Symbol or NDArray
- Positional embeddings. Shape (memory_length + 1, value_dim)
- mask : Symbol or NDArray
- Mask of the memory slots. Shape (batch_size, query_length, memory_length)
- Only contains 0 or 1 where 0 means that the memory slot will not be used.
- If set to None, no mask will be used.
-
- Returns
- -------
- context_vec : Symbol or NDArray
- Shape (batch_size, query_length, context_vec_dim)
- att_weights : Symbol or NDArray
- Attention weights of multiple heads.
- Shape (batch_size, num_heads, query_length, memory_length)
- """
- att_weights = self._compute_weight(F, query, key, emb, mask, query_key_bias=query_key_bias,
- query_emb_bias=query_emb_bias)
- context_vec = self._read_by_weight(F, att_weights, value)
- return context_vec, att_weights
-
- def _project(self, F, name, x):
- # Shape (batch_size, query_length, num_heads * d_head)
- x = getattr(self, 'proj_{}'.format(name))(x)
- # Shape (batch_size * num_heads, query_length, d_head)
- x = F.transpose(x.reshape(shape=(0, 0, self._num_heads, -1)),
- axes=(0, 2, 1, 3))\
- .reshape(shape=(-1, 0, 0), reverse=True)
- return x
-
- def _compute_weight(self, F, query, key, emb, mask, query_key_bias, query_emb_bias):
- # Project query, key and emb
- proj_query = self.proj_query(query).reshape(shape=(0, 0, self._num_heads, -1))
- proj_key = self.proj_key(key).reshape(shape=(0, 0, self._num_heads, -1))
- proj_emb = self.proj_emb(emb).reshape(shape=(-1, self._num_heads, self._d_head))
-
- # Add biases and transpose to (batch_size, num_heads, query_length,
- # d_head) or (num_heads, query_length, d_head)
- query_with_key_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_key_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_emb_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_emb_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- proj_key = F.transpose(proj_key, axes=(0, 2, 1, 3))
- proj_emb = F.transpose(proj_emb, axes=(1, 0, 2))
-
- # Broadcast emb along batch axis
- proj_emb = F.broadcast_like(F.expand_dims(proj_emb, axis=0), proj_key, lhs_axes=(0, ),
- rhs_axes=(0, ))
-
- # Merge batch and num_heads axes
- query_with_key_bias = query_with_key_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_key = proj_key.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_emb_bias = query_with_emb_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_emb = proj_emb.reshape(shape=(-1, 0, 0), reverse=True)
-
- if mask is not None:
- # Insert and broadcast along num_heads axis. Merge num_heads and
- # batch_size axes: (batch_size * num_heads, query_length,
- # memory_length)
- mask = F.broadcast_axis(F.expand_dims(mask, axis=1), axis=1, size=self._num_heads)\
- .reshape(shape=(-1, 0, 0), reverse=True)
-
- att_score_AC = F.batch_dot(query_with_key_bias, proj_key, transpose_b=True)
- att_score_BD = F.batch_dot(query_with_emb_bias, proj_emb, transpose_b=True)
-
- # Relative shift
- shifted_att_score_BD = _rel_shift(F, att_score_BD)
- shifted_att_score_BD = F.slice_like(shifted_att_score_BD, shape_like=att_score_AC,
- axes=(2, ))
-
- att_score = att_score_AC + shifted_att_score_BD
- if self._scaled:
- att_score = att_score / math.sqrt(self._d_head)
-
- att_weights = _masked_softmax(F, att_score, mask, self._dtype)
- if self._dropout:
- att_weights = self._dropout_layer(att_weights)
-
- return att_weights.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True)
-
- def _read_by_weight(self, F, att_weights, value):
- att_weights = att_weights.reshape(shape=(-1, 0, 0), reverse=True)
- proj_value = self._project(F, 'value', value)
- context_vec = F.batch_dot(att_weights, proj_value)
- context_vec = F.transpose(
- context_vec.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True),
- axes=(0, 2, 1, 3)).reshape(shape=(0, 0, -1))
- return context_vec
-
-
-class RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell(mx.gluon.HybridBlock):
- """Multi-head Attention Cell with positional embeddings.
-
- Parameters
- ----------
- d_head
- Number of projected units for respectively query, key, value and
- positional embeddings per attention head.
- num_heads
- Number of parallel attention heads
- dropout
- Dropout probability applied to the attention weights.
- scaled
- Whether to scale the attention score by 1/sqrt(d_head).
- weight_initializer : str or `Initializer` or None, default None
- Initializer of the weights.
- bias_initializer : str or `Initializer`, default 'zeros'
- Initializer of the bias.
- embedding_initializer
- Initializer of the segment embeddings.
- """
-
- def __init__(self, d_head: int, num_heads: int, dropout: float, scaled: bool,
- weight_initializer=None, embedding_initializer=None, bias_initializer='zeros',
- dtype='float32', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._d_head = d_head
- self._num_heads = num_heads
- self._dropout = dropout
- self._scaled = scaled
- self._dtype = dtype
- units = ['query', 'key', 'value', 'emb']
- with self.name_scope():
- for name in units:
- setattr(
- self, 'proj_{}'.format(name),
- mx.gluon.nn.Dense(units=d_head * num_heads, use_bias=False, flatten=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='{}_'.format(name)))
- self.query_key_bias = self.params.get('query_key_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.query_emb_bias = self.params.get('query_emb_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- self.seg_emb = self.params.get('seg_emb', shape=(2, num_heads, d_head),
- init=embedding_initializer)
- self.query_seg_bias = self.params.get('query_seg_bias', shape=(num_heads, d_head),
- init=bias_initializer)
- if dropout:
- self._dropout_layer = mx.gluon.nn.Dropout(dropout)
-
- # pylint: disable=arguments-differ
- def hybrid_forward(self, F, query, key, value, emb, mask, segments, query_key_bias,
- query_emb_bias, seg_emb, query_seg_bias):
- """Compute the attention.
-
- Parameters
- ----------
- query : Symbol or NDArray
- Query vector. Shape (batch_size, query_length, query_dim)
- key : Symbol or NDArray
- Key of the memory. Shape (batch_size, memory_length, key_dim)
- value : Symbol or NDArray
- Value of the memory. Shape (batch_size, memory_length, value_dim)
- emb : Symbol or NDArray
- Positional embeddings. Shape (memory_length + 1, value_dim) or
- (memory_length + query_length + 1, value_dim)
- mask : Symbol or NDArray
- Mask of the memory slots. Shape (batch_size, query_length, memory_length)
- Only contains 0 or 1 where 0 means that the memory slot will not be used.
- If set to None, no mask will be used.
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, key_length, 2].
- `1` indicates that the pair is not in the same segment.
-
- Returns
- -------
- context_vec : Symbol or NDArray
- Shape (batch_size, query_length, context_vec_dim)
- att_weights : Symbol or NDArray
- Attention weights of multiple heads.
- Shape (batch_size, num_heads, query_length, memory_length)
- """
- att_weights = self._compute_weight(F, query=query, key=key, emb=emb, segments=segments,
- seg_emb=seg_emb, mask=mask,
- query_key_bias=query_key_bias,
- query_emb_bias=query_emb_bias,
- query_seg_bias=query_seg_bias)
- context_vec = self._read_by_weight(F, att_weights, value)
- return context_vec, att_weights
-
- def _project(self, F, name, x):
- # Shape (batch_size, query_length, num_heads * d_head)
- x = getattr(self, 'proj_{}'.format(name))(x)
- # Shape (batch_size * num_heads, query_length, d_head)
- x = F.transpose(x.reshape(shape=(0, 0, self._num_heads, -1)),
- axes=(0, 2, 1, 3))\
- .reshape(shape=(-1, 0, 0), reverse=True)
- return x
-
- def _compute_weight(self, F, query, key, emb, segments, seg_emb, mask, query_key_bias,
- query_emb_bias, query_seg_bias):
- # Project query, key and emb
- proj_query = self.proj_query(query).reshape(shape=(0, 0, self._num_heads, -1))
- proj_key = self.proj_key(key).reshape(shape=(0, 0, self._num_heads, -1))
- proj_emb = self.proj_emb(emb).reshape(shape=(-1, self._num_heads, self._d_head))
-
- # Add biases and transpose to (batch_size, num_heads, query_length,
- # d_head) or (num_heads, query_length, d_head)
- query_with_key_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_key_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_emb_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_emb_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- query_with_seg_bias = F.transpose(
- F.broadcast_add(proj_query, F.reshape(query_seg_bias, shape=(1, 1, 0, 0),
- reverse=True)), axes=(0, 2, 1, 3))
- proj_key = F.transpose(proj_key, axes=(0, 2, 1, 3))
- proj_emb = F.transpose(proj_emb, axes=(1, 0, 2))
-
- # Broadcast emb along batch axis
- proj_emb = F.broadcast_like(F.expand_dims(proj_emb, axis=0), proj_key, lhs_axes=(0, ),
- rhs_axes=(0, ))
-
- # Merge batch and num_heads axes
- query_with_key_bias = query_with_key_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_key = proj_key.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_emb_bias = query_with_emb_bias.reshape(shape=(-1, 0, 0), reverse=True)
- proj_emb = proj_emb.reshape(shape=(-1, 0, 0), reverse=True)
- query_with_seg_bias = query_with_seg_bias.reshape(shape=(-1, 0, 0), reverse=True)
-
- if mask is not None:
- # Insert and broadcast along num_heads axis. Merge num_heads and
- # batch_size axes: (batch_size * num_heads, query_length,
- # memory_length)
- mask = F.broadcast_axis(F.expand_dims(mask, axis=1), axis=1, size=self._num_heads)\
- .reshape(shape=(-1, 0, 0), reverse=True)
-
- att_score_AC = F.batch_dot(query_with_key_bias, proj_key, transpose_b=True)
- att_score_BD = F.batch_dot(query_with_emb_bias, proj_emb, transpose_b=True)
-
- # Relative Segment Embeddings
- # einsum bnid,snd->bnis
- seg_emb = F.transpose(seg_emb, axes=(1, 2, 0)).expand_dims(0)
- seg_emb = F.broadcast_like(lhs=seg_emb, rhs=query, lhs_axes=[0], rhs_axes=[0])
- seg_emb = seg_emb.reshape(shape=(-1, 0, 0), reverse=True)
- # seg_emb of shape (batch_size * num_heads, d_head, 2)
-
- ef = F.batch_dot(query_with_seg_bias, seg_emb)
- ef = ef.reshape(shape=(-1, self._num_heads, 0, 2), reverse=True)
- # ef of shape (batch_size, num_heads, query_length, 2)
-
- # einsum bijs,bnis->bnij
- segments = segments.reshape(shape=(-1, 2), reverse=True)
- # segments of shape (batch_size * query_length * memory_length, 2)
- efs = []
- for n in range(self._num_heads):
- # shape (batch_size, 1, query_length, 2)
- ef_n = ef.slice_axis(axis=1, begin=n, end=n + 1)
- ef_n = ef_n.transpose((0, 2, 1, 3)) # shape (batch_size, query_length, 1, 2)
- ef_n = F.broadcast_like(lhs=ef_n, rhs=key, lhs_axes=[2], rhs_axes=[1])
- ef_n_merged = ef_n.reshape(shape=(-1, 2), reverse=True)
- # ef_n_merged of shape (batch_size * query_length * memory_length, 2)
-
- ef_n_result = F.batch_dot(segments.expand_dims(1), ef_n_merged.expand_dims(2))
- # ef_n_result of shape (batch_size * query_length * memory_length, 1, 1)
- ef_n_result = ef_n_result.reshape_like(ef_n, lhs_begin=0, lhs_end=3, rhs_begin=0,
- rhs_end=3).expand_dims(1)
- # ef_n_result of shape (batch_size, 1, query_length, memory_length)
- efs.append(ef_n_result)
-
- att_score_EF = F.concat(*efs, dim=1).reshape(shape=(-1, 0, 0), reverse=True)
- # shape (batch_size * num_heads, query_length, memory_length)
-
- # Relative shift
- shifted_att_score_BD = _rel_shift(F, att_score_BD)
- shifted_att_score_BD = F.slice_like(shifted_att_score_BD, shape_like=att_score_AC,
- axes=(2, ))
-
- att_score = att_score_AC + shifted_att_score_BD + att_score_EF
- if self._scaled:
- att_score = att_score / math.sqrt(self._d_head)
-
- att_weights = _masked_softmax(F, att_score, mask, self._dtype)
- if self._dropout:
- att_weights = self._dropout_layer(att_weights)
-
- return att_weights.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True)
-
- def _read_by_weight(self, F, att_weights, value):
- att_weights = att_weights.reshape(shape=(-1, 0, 0), reverse=True)
- proj_value = self._project(F, 'value', value)
- context_vec = F.batch_dot(att_weights, proj_value)
- context_vec = F.transpose(
- context_vec.reshape(shape=(-1, self._num_heads, 0, 0), reverse=True),
- axes=(0, 2, 1, 3)).reshape(shape=(0, 0, -1))
- return context_vec
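The `_rel_shift` trick used by both attention cells above is easier to see in plain NumPy. The sketch below mirrors the reshape / slice / reshape sequence on an array of shape (batch, q_len, k_len + 1); as in the original, the caller is expected to trim the result back to the memory length afterwards (the original uses `slice_like`).

import numpy as np

def rel_shift(x):
    # x: (batch, q_len, k_len + 1) -> (batch, q_len, k_len)
    b, q, k1 = x.shape
    x = x.reshape(b, k1, q)          # reinterpret the last two axes (no transpose of the data)
    x = x[:, 1:, :]                  # drop the first "row" of the reinterpreted view
    return x.reshape(b, q, k1 - 1)   # reinterpret back into q_len rows of length k_len

# Per batch element, the first q_len entries of the flattened scores are dropped and the
# remainder re-chunked into q_len rows of length k_len, realigning the relative-position scores.
x = np.arange(1 * 3 * 5, dtype=np.float64).reshape(1, 3, 5)
print(x[0])
print(rel_shift(x)[0])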
diff --git a/scripts/language_model/transformer/data.py b/scripts/language_model/transformer/data.py
deleted file mode 100644
index b4b0ef3b7f..0000000000
--- a/scripts/language_model/transformer/data.py
+++ /dev/null
@@ -1,111 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=invalid-encoded-data, too-many-lines
-"""Transformer API.
-
- It provides tools for common transformations on samples in text datasets, such as
-clipping, padding, and tokenization.
-"""
-import unicodedata
-from typing import List, Optional
-
-import gluonnlp as nlp
-
-__all__ = ['XLNetTokenizer']
-
-
-class XLNetTokenizer:
- """End-to-end tokenization for XLNet models.
-
- Parameters
- ----------
- sentencepiece_path
- Path to sentencepiece model, to be used for obtaining word pieces.
-
- .. note::
-
- For multi-processing, it is recommended to make an extra copy of the
- XLNetTokenizer instance before calling it for the first time.
- SentencePiece models cannot be pickled, which would be required for
- multi-processing, and the SentencePiece model is therefore only initialized
- during the first call.
-
- Examples
- --------
- >>> _, vocab = gluonnlp.model.bert_12_768_12(dataset_name='wiki_multilingual_uncased',
- ... pretrained=False, root='./model')
- -etc-
- >>> tokenizer = gluonnlp.data.BERTTokenizer(vocab=vocab)
- >>> tokenizer('gluonnlp: 使NLP变得简单。')
- ['gl', '##uo', '##nn', '##lp', ':', '使', 'nl', '##p', '变', '得', '简', '单', '。']
-
- """
- _spiece_prefix = '▁'
-
- def __init__(self, sentencepiece_path: str, lower: bool = False, remove_space: bool = True,
- keep_accents: bool = False):
- self._sentencepiece_path = sentencepiece_path
- self._lower = lower
- self._remove_space = remove_space
- self._keep_accents = keep_accents
- self._sentencepiece = None # type: Optional[nlp.data.SentencepieceTokenizer]
-
- def __call__(self, sample: str) -> List[str]:
- """Tokenize a sample.
-
- Parameters
- ----------
- sample
- The string to tokenize.
-
- Returns
- -------
- tokens
- List of tokens
- """
-
- if self._remove_space:
- outputs = ' '.join(sample.strip().split())
- else:
- outputs = sample
- outputs = outputs.replace('``', '"').replace('\'\'', '"')
-
- if not self._keep_accents:
- outputs = unicodedata.normalize('NFKD', outputs)
- outputs = ''.join([c for c in outputs if not unicodedata.combining(c)])
- if self._lower:
- outputs = outputs.lower()
-
- if self._sentencepiece is None:
- self._sentencepiece = nlp.data.SentencepieceTokenizer(self._sentencepiece_path)
-
- pieces = self._sentencepiece(outputs)
- new_pieces = [] # type: List[str]
- for piece in pieces:
- if len(piece) > 1 and piece[-1] == ',' and piece[-2].isdigit():
- cur_pieces = self._sentencepiece(piece[:-1].replace(self._spiece_prefix, ''))
- if piece[0] != self._spiece_prefix and cur_pieces[0][0] == self._spiece_prefix:
- if len(cur_pieces[0]) == 1:
- cur_pieces = cur_pieces[1:]
- else:
- cur_pieces[0] = cur_pieces[0][1:]
- cur_pieces.append(piece[-1])
- new_pieces.extend(cur_pieces)
- else:
- new_pieces.append(piece)
-
- return new_pieces
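Everything `XLNetTokenizer.__call__` does before handing the string to SentencePiece is self-contained. The sketch below reproduces just that normalization (whitespace cleanup, quote normalization, optional accent stripping and lower-casing); the SentencePiece step itself needs the .spiece model file and is omitted here.

import unicodedata

def preprocess_text(sample, lower=False, remove_space=True, keep_accents=False):
    # Collapse whitespace and normalize LaTeX-style quotes, as in XLNetTokenizer.__call__.
    outputs = ' '.join(sample.strip().split()) if remove_space else sample
    outputs = outputs.replace('``', '"').replace("''", '"')
    if not keep_accents:
        # NFKD-decompose and drop combining marks (accent stripping).
        outputs = unicodedata.normalize('NFKD', outputs)
        outputs = ''.join(c for c in outputs if not unicodedata.combining(c))
    if lower:
        outputs = outputs.lower()
    return outputs

print(preprocess_text("  Ça  va ``bien''  ", lower=True))  # -> ca va "bien"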
diff --git a/scripts/language_model/transformer/embedding.py b/scripts/language_model/transformer/embedding.py
deleted file mode 100644
index c937e09457..0000000000
--- a/scripts/language_model/transformer/embedding.py
+++ /dev/null
@@ -1,156 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['AdaptiveEmbedding', 'ProjectedEmbedding']
-
-from typing import List
-
-import mxnet as mx
-
-
-class ProjectedEmbedding(mx.gluon.HybridBlock):
- """Projected Embedding"""
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, project_same_dim: bool = True,
- embedding_initializer=None, projection_initializer=None, prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._units = units
- self._project_same_dim = project_same_dim
- self._emb_scale = units**0.5
-
- with self.name_scope():
- self.embedding_weight = self.params.get('embedding_weight',
- shape=(vocab_size, embed_size),
- init=embedding_initializer)
- if units != embed_size or project_same_dim:
- self.projection_weight = self.params.get('projection_weight',
- shape=(units, embed_size),
- init=projection_initializer)
-
- def hybrid_forward(self, F, inp, **params): # pylint: disable=arguments-differ
- emb = F.Embedding(data=inp, weight=params['embedding_weight'], input_dim=self._vocab_size,
- output_dim=self._embed_size)
- if self._units != self._embed_size or self._project_same_dim:
- emb = F.FullyConnected(data=emb, weight=params['projection_weight'], no_bias=True,
- flatten=False, num_hidden=self._units)
- return emb * self._emb_scale
-
-
-class AdaptiveEmbedding(mx.gluon.HybridBlock):
- """Adaptive Embedding
-
- Baevski, A., & Auli, M. (2019). Adaptive input representations for neural
- language modeling. In International Conference on Learning Representations.
-
- """
-
- # TODO: Transformer-XL has a sample_softmax argument here
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, cutoffs: List[int],
- div_val: int = 1, project_same_dim: bool = True, embedding_initializer=None,
- projection_initializer=None, prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- # Sanity checks
- if cutoffs != sorted(cutoffs):
- raise ValueError('cutoffs must be a sorted list of cutoff values. '
- 'Got {}, but expected {}'.format(cutoffs, sorted(cutoffs)))
- if not cutoffs:
- raise ValueError('cutoffs must not be empty. Got {}'.format(cutoffs))
- if cutoffs[0] <= 0:
- raise ValueError('The first cutoff value ({}) must be greater than 0.'.format(cutoffs[0]))
- if cutoffs[-1] >= vocab_size:
- raise ValueError(
- 'The last cutoff value ({}) must be smaller than vocab_size ({}).'.format(
- cutoffs[-1], vocab_size))
-
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._cutoffs = [0] + cutoffs + [vocab_size]
- self._div_val = div_val
- self._units = units
- self._project_same_dim = project_same_dim
- self._emb_scale = units**0.5
-
- with self.name_scope():
- if self._div_val == 1:
- name = 'embedding0_weight'
- setattr(
- self, name,
- self.params.get(name, shape=(vocab_size, embed_size),
- init=embedding_initializer))
-
- if units != embed_size or project_same_dim:
- name = 'projection0_weight'
- setattr(
- self, name,
- self.params.get(name, shape=(units, embed_size),
- init=projection_initializer))
- else:
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- name = 'embedding{}_weight'.format(i)
- setattr(
- self, name,
- self.params.get(name, shape=(r_idx - l_idx, embed_size // div_val**i),
- init=embedding_initializer))
-
- if units != embed_size // div_val**i or project_same_dim:
- name = 'projection{}_weight'.format(i)
- setattr(
- self, name,
- self.params.get(name, shape=(units, embed_size // div_val**i),
- init=projection_initializer))
-
- def hybrid_forward(self, F, inp, **params): # pylint: disable=arguments-differ
- if self._div_val == 1:
- emb = F.Embedding(data=inp, weight=params['embedding0_weight'],
- input_dim=self._vocab_size, output_dim=self._embed_size)
- if self._units != self._embed_size or self._project_same_dim:
- emb = F.FullyConnected(data=emb, weight=params['projection0_weight'], no_bias=True,
- flatten=False, num_hidden=self._units)
- else:
- inp_flat = inp.reshape((-1, ))
- zeros_like_inp_flat = F.zeros_like(inp_flat)
- ones_like_inp_flat = F.ones_like(inp_flat)
- emb_flat = None
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- cond_i = F.broadcast_logical_and(inp_flat >= l_idx, inp_flat < r_idx)
- inp_i = F.where(cond_i, inp_flat - l_idx, zeros_like_inp_flat)
- mask_i = F.expand_dims(F.where(cond_i, ones_like_inp_flat, zeros_like_inp_flat),
- axis=1)
-
- emb_i = F.Embedding(data=inp_i, weight=params['embedding{}_weight'.format(i)],
- input_dim=r_idx - l_idx,
- output_dim=self._embed_size // self._div_val**i)
- emb_i = F.broadcast_mul(emb_i, mask_i)
- if self._units != self._embed_size // self._div_val**i or self._project_same_dim:
- emb_i = F.FullyConnected(data=emb_i,
- weight=params['projection{}_weight'.format(i)],
- no_bias=True, flatten=False, num_hidden=self._units)
-
- if emb_flat is None: # i == 0
- emb_flat = emb_i
- else:
- emb_flat = emb_flat + emb_i
-
- emb = F.reshape_like(emb_flat, inp, lhs_begin=0, lhs_end=1)
-
- emb = emb * self._emb_scale
-
- return emb
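A NumPy sketch of what `AdaptiveEmbedding` computes may help: the vocabulary is partitioned at `cutoffs`, cluster i gets an embedding of width embed_size // div_val**i, and each cluster is projected to `units` so the per-cluster results can be summed. The weights below are random and the projection is applied unconditionally, so only the control flow (and the final `units**0.5` scaling) mirrors the block above.

import numpy as np

def adaptive_embed(inp, vocab_size, embed_size, units, cutoffs, div_val=2, seed=0):
    rng = np.random.default_rng(seed)
    bounds = [0] + list(cutoffs) + [vocab_size]
    out = np.zeros(inp.shape + (units,))
    for i, (lo, hi) in enumerate(zip(bounds, bounds[1:])):
        width = embed_size // div_val**i              # rarer clusters get narrower embeddings
        emb = rng.standard_normal((hi - lo, width))   # random stand-in weights
        proj = rng.standard_normal((width, units))
        mask = (inp >= lo) & (inp < hi)
        ids = np.where(mask, inp - lo, 0)             # clamp out-of-cluster ids to 0
        out += (emb[ids] @ proj) * mask[..., None]    # zero out rows not in this cluster
    return out * units**0.5                           # same scaling as the block above

tokens = np.array([[3, 25000, 199999], [7, 42, 123456]])
print(adaptive_embed(tokens, vocab_size=200000, embed_size=1024, units=512,
                     cutoffs=[20000, 40000]).shape)  # (2, 3, 512)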
diff --git a/scripts/language_model/transformer/model.py b/scripts/language_model/transformer/model.py
deleted file mode 100644
index de4d7dbbe6..0000000000
--- a/scripts/language_model/transformer/model.py
+++ /dev/null
@@ -1,300 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import, arguments-differ
-"""Module for pre-defined NLP models."""
-
-import errno
-import os
-import time
-import zipfile
-from typing import Optional
-
-import mxnet as mx
-from mxnet.gluon.model_zoo import model_store
-from mxnet.gluon.utils import _get_repo_url, check_sha1, download
-
-import gluonnlp as nlp
-from gluonnlp.base import get_home_dir
-from gluonnlp.data.utils import _url_format
-from gluonnlp.model.utils import _load_pretrained_params, _load_vocab
-
-from .data import XLNetTokenizer
-from .transformer import TransformerXL, XLNet
-
-__all__ = ['get_model']
-
-model_store._model_sha1.update({
- name: checksum
- for checksum, name in [
- ('ca7a092186ec3f42ef25590a872450409faaa84f', 'xlnet_cased_l12_h768_a12_126gb'),
- ('ceae74798c1577bcf5ffb3c46b73b056a5ead786', 'xlnet_cased_l24_h1024_a16_126gb'),
- ]
-})
-
-
-def get_model(name, **kwargs):
- """Returns a pre-defined model by name."""
- models = {
- # TODO better naming scheme when moving this to main API?
- 'transformerxl': transformerxl,
- 'xlnet_cased_l12_h768_a12': xlnet_cased_l12_h768_a12,
- 'xlnet_cased_l24_h1024_a16': xlnet_cased_l24_h1024_a16
- }
- name = name.lower()
- if name not in models:
- raise ValueError('Model %s is not supported. Available options are\n\t%s' %
- (name, '\n\t'.join(sorted(models.keys()))))
- return models[name](**kwargs)
-
-
-def transformerxl(dataset_name: str, vocab: nlp.Vocab, **kwargs):
- """Generic pre-trained Transformer-XL model.
-
- The hyperparameters are chosen based on the specified dataset_name from the
- published hyperparameters of Dai et al.
-
-
- References:
- Dai, Z., Yang, Z., Yang, Y., Cohen, W. W., Carbonell, J., Le, Q. V., &
- Salakhutdinov, R. (2019). Transformer-XL: Attentive language models beyond
- a fixed-length context. arXiv preprint arXiv:1901.02860.
-
- Parameters
- ----------
- dataset_name
- Used to load hyperparameters for the dataset.
- vocab
- Vocabulary for the dataset.
-
- Returns
- -------
- TransformerXL, gluonnlp.Vocab
-
- """
-
- dataset_name_to_kwargs = dict(
- wt103={
- 'embed_cutoffs': [20000, 40000, 200000],
- 'embed_size': 1024,
- 'embed_div_val': 4,
- 'tie_input_output_embeddings': True,
- 'tie_input_output_projections': [False, True, True, True],
- 'num_layers': 18,
- 'hidden_size': 4096,
- 'units': 1024,
- 'num_heads': 16,
- 'dropout': 0,
- 'attention_dropout': 0
- }, lm1b={
- 'embed_cutoffs': [60000, 100000, 640000],
- 'embed_size': 1280,
- 'embed_div_val': 4,
- 'project_same_dim': False,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 8192,
- 'units': 1280,
- 'num_heads': 16,
- 'dropout': 0,
- 'attention_dropout': 0
- }, enwik8={
- 'embed_size': 1024,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 3072,
- 'units': 1024,
- 'num_heads': 8,
- 'dropout': 0,
- 'attention_dropout': 0
- }, text8={
- 'embed_size': 1024,
- 'tie_input_output_embeddings': True,
- 'num_layers': 24,
- 'hidden_size': 3072,
- 'units': 1024,
- 'num_heads': 8,
- 'dropout': 0,
- 'attention_dropout': 0
- })
-
- options = dataset_name_to_kwargs[dataset_name]
- options.update(**kwargs)
- model = TransformerXL(vocab_size=len(vocab), **options)
- return model, vocab
-
-
-def xlnet_cased_l12_h768_a12(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
- tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
- ctx: mx.Context = mx.cpu(),
- root=os.path.join(get_home_dir(), 'models'),
- do_lower_case=False, **kwargs):
- """XLNet model.
-
- References:
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
-
- Parameters
- ----------
- dataset_name : str or None, default None
- If not None, the dataset name is used to load a vocabulary for the
- dataset. If the `pretrained` argument is set to True, the dataset name
- is further used to select the pretrained parameters to load.
- Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
- vocab : gluonnlp.vocab.Vocab or None, default None
- Vocabulary for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- tokenizer : XLNetTokenizer or None, default None
- XLNetTokenizer for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- pretrained : bool, default True
- Whether to load the pretrained weights for model.
- ctx : Context, default CPU
- The context in which to load the pretrained weights.
- root : str, default '$MXNET_HOME/models'
- Location for keeping the model parameters.
- MXNET_HOME defaults to '~/.mxnet'.
-
- Returns
- -------
- XLNet, gluonnlp.Vocab, XLNetTokenizer
- """
-
- kwargs.update(**{
- 'hidden_size': 3072,
- 'units': 768,
- 'activation': 'gelu',
- 'num_heads': 12,
- 'num_layers': 12,
- })
- if vocab is None or dataset_name is not None:
- vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
- net = XLNet(vocab_size=len(vocab), **kwargs)
- if pretrained:
- _load_pretrained_params(net=net, model_name='xlnet_cased_l12_h768_a12',
- dataset_name=dataset_name, root=root, ctx=ctx,
- ignore_extra=not kwargs.get('use_decoder', True))
- if tokenizer is None or dataset_name is not None:
- tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
- return net, vocab, tokenizer
-
-
-def xlnet_cased_l24_h1024_a16(dataset_name: Optional[str] = None, vocab: Optional[nlp.Vocab] = None,
- tokenizer: Optional[XLNetTokenizer] = None, pretrained: bool = True,
- ctx: mx.Context = mx.cpu(),
- root=os.path.join(get_home_dir(), 'models'),
- do_lower_case=False, **kwargs):
- """XLNet model.
-
- References:
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
-
- Parameters
- ----------
- dataset_name : str or None, default None
- If not None, the dataset name is used to load a vocabulary for the
- dataset. If the `pretrained` argument is set to True, the dataset name
- is further used to select the pretrained parameters to load.
- Options include 'books_enwiki_giga5_clueweb2012b_commoncrawl'.
- vocab : gluonnlp.vocab.Vocab or None, default None
- Vocabulary for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- tokenizer : XLNetTokenizer or None, default None
- XLNetTokenizer for the dataset. Must be provided if dataset_name is not
- specified. Ignored if dataset_name is specified.
- pretrained : bool, default True
- Whether to load the pretrained weights for model.
- ctx : Context, default CPU
- The context in which to load the pretrained weights.
- root : str, default '$MXNET_HOME/models'
- Location for keeping the model parameters.
- MXNET_HOME defaults to '~/.mxnet'.
-
- Returns
- -------
- XLNet, gluonnlp.Vocab, XLNetTokenizer
-
- """
- kwargs.update(**{
- 'hidden_size': 4096,
- 'units': 1024,
- 'activation': 'approx_gelu',
- 'num_heads': 16,
- 'num_layers': 24,
- })
- if vocab is None or dataset_name is not None:
- vocab = _load_vocab('xlnet_' + dataset_name, vocab, root)
- net = XLNet(vocab_size=len(vocab), **kwargs)
- if pretrained:
- _load_pretrained_params(net=net, model_name='xlnet_cased_l24_h1024_a16',
- dataset_name=dataset_name, root=root, ctx=ctx,
- ignore_extra=not kwargs.get('use_decoder', True))
- if tokenizer is None or dataset_name is not None:
- tokenizer = _get_xlnet_tokenizer(dataset_name, root, do_lower_case)
- return net, vocab, tokenizer
-
-
-def _get_xlnet_tokenizer(dataset_name, root, do_lower_case=False):
- assert dataset_name.lower() == '126gb'
- root = os.path.expanduser(root)
- file_path = os.path.join(root, 'xlnet_126gb-871f0b3c.spiece')
- sha1_hash = '871f0b3c13b92fc5aea8fba054a214c420e302fd'
- if os.path.exists(file_path):
- if not check_sha1(file_path, sha1_hash):
- print('Detected mismatch in the content of model tokenizer. Downloading again.')
- else:
- print('Tokenizer file is not found. Downloading.')
-
- if not os.path.exists(root):
- try:
- os.makedirs(root)
- except OSError as e:
- if e.errno == errno.EEXIST and os.path.isdir(root):
- pass
- else:
- raise e
-
- repo_url = _get_repo_url()
- prefix = str(time.time())
- zip_file_path = os.path.join(root, prefix + 'xlnet_126gb-871f0b3c.zip')
- if repo_url[-1] != '/':
- repo_url = repo_url + '/'
- download(_url_format.format(repo_url=repo_url, file_name='xlnet_126gb-871f0b3c'),
- path=zip_file_path, overwrite=True)
- with zipfile.ZipFile(zip_file_path) as zf:
- if not os.path.exists(file_path):
- zf.extractall(root)
- try:
- os.remove(zip_file_path)
- except OSError as e:
- # file has already been removed.
- if e.errno == 2:
- pass
- else:
- raise e
-
- if not check_sha1(file_path, sha1_hash):
- raise ValueError('Downloaded file has different hash. Please try again.')
-
- tokenizer = XLNetTokenizer(file_path, lower=do_lower_case)
- return tokenizer
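For context, the entry point removed above was used roughly as follows. This is illustrative only: the module is deleted by this change, the import assumes the working directory is scripts/language_model, and the forward call is omitted because the XLNet block's signature is defined elsewhere.

import mxnet as mx
from transformer import get_model  # the scripts/language_model/transformer package deleted above

# Downloads the vocabulary, pretrained parameters and SentencePiece tokenizer file on first use.
net, vocab, tokenizer = get_model('xlnet_cased_l12_h768_a12', dataset_name='126gb',
                                  pretrained=True, ctx=mx.cpu())
tokens = tokenizer('GluonNLP is great!')
token_ids = mx.nd.array([vocab[tokens]])
print(tokens, token_ids.shape)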
diff --git a/scripts/language_model/transformer/softmax.py b/scripts/language_model/transformer/softmax.py
deleted file mode 100644
index cbd86d01e5..0000000000
--- a/scripts/language_model/transformer/softmax.py
+++ /dev/null
@@ -1,360 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['AdaptiveLogSoftmaxWithLoss', 'ProjectedLogSoftmaxWithLoss']
-
-from typing import List, Optional
-
-import mxnet as mx
-
-
-class ProjectedLogSoftmaxWithLoss(mx.gluon.HybridBlock):
- """ProjectedLogSoftmaxWithLoss"""
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, use_bias: bool = True,
- project_same_dim: bool = True, projection_initializer=None,
- embedding_initializer=None, tie_embeddings: bool = False,
- tie_projections: bool = False, prefix: Optional[str] = None,
- params: Optional[mx.gluon.ParameterDict] = None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._use_bias = use_bias
- self._units = units
- self._project_same_dim = project_same_dim
- self._embedding_initializer = embedding_initializer
- self._projection_initializer = projection_initializer
- self._tie_embeddings = tie_embeddings
- self._tie_projections = tie_projections
-
- self._projections_name = '{}projection_weight'
- self._embeddings_name = '{}embedding_weight'
- with self.name_scope():
- if units != embed_size or project_same_dim:
- name = self._get_param_name('projection')
- param = self.params.get(name, shape=(units, embed_size),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding')
- param = self.params.get(name, shape=(vocab_size, embed_size),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding_bias'
- param = self.params.get(name, shape=(self._vocab_size, ))
- setattr(self, name, param)
-
- def _get_param_name(self, name):
- if name == 'projection':
- return self._projections_name.format('' if self._tie_projections else 'out')
- elif name == 'embedding':
- return self._embeddings_name.format('' if self._tie_embeddings else 'out')
- else:
- raise ValueError('Invalid name')
-
- def hybrid_forward(self, F, hidden, target, **params): # pylint: disable=arguments-differ
- """Compute adaptive softmax.
-
- Parameters
- ----------
- hidden : Symbol or NDArray
- Inputs of shape [batch_size, sequence_length, units]
- target : Symbol or NDArray
- Targets of shape [batch_size, sequence_length]
-
- Returns
- -------
- out : Symbol or NDArray
- Negative log likelihood of targets with shape [batch_size,
- sequence_length]
- """
- if target is None: # TODO support None or add separate log_prob method
- raise NotImplementedError()
-
- # Work with flat data for simplicity
- target_flat = target.reshape((-1, ))
- hidden = F.reshape(hidden, shape=(-1, 0), reverse=True)
-
- # Helper arrays
- if F is mx.nd:
- range_bs_len = mx.nd.arange(target_flat.shape[0], dtype=target_flat.dtype,
- ctx=target_flat.context)
- else:
- # Shape inference fails when relying on F.stack(range_bs_len, ...)
- # below. Thus add zeros of intended shape here to simplify the
- # shape inference problem.
- range_bs_len = F.zeros_like(target_flat) + F.arange(start=0, stop=None,
- infer_range=True)
-
- if self._units != self._embed_size or self._project_same_dim:
- name = self._get_param_name('projection')
- hidden = F.FullyConnected(data=hidden, weight=F.transpose(params[name]), no_bias=True,
- flatten=False, num_hidden=self._embed_size)
-
- name = self._get_param_name('embedding')
- logits = F.FullyConnected(data=hidden, weight=params[name],
- bias=params['outembedding_bias'] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False,
- num_hidden=self._vocab_size)
- logprob = F.log_softmax(logits)
- target_ = F.stack(range_bs_len, target_flat)
- out = F.gather_nd(logprob, indices=target_)
-
- out = F.reshape_like(out, target)
-
- return -out
-
-
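The hybrid_forward above boils down to a log-softmax over the full vocabulary followed by gathering the log-probability of each target token; a minimal numpy sketch of that pattern (illustration only):

import numpy as np

def nll(logits, targets):
    # logits: (batch * length, vocab_size); targets: (batch * length,)
    z = logits - logits.max(axis=1, keepdims=True)               # numerical stability
    logprob = z - np.log(np.exp(z).sum(axis=1, keepdims=True))   # log_softmax
    return -logprob[np.arange(len(targets)), targets]            # gather target log-probs

logits = np.random.randn(4, 10)
targets = np.array([3, 1, 7, 0])
print(nll(logits, targets))  # shape (4,): one loss value per token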
-class AdaptiveLogSoftmaxWithLoss(mx.gluon.HybridBlock):
- """Efficient softmax approximation
-
- Grave, E., Joulin, A., Cissé, M., Jégou, H., & others, (2017). Efficient
- softmax approximation for GPUs. In Proceedings of the 34th International
- Conference on Machine Learning - Volume 70 (pp. 1302–1310).
-
- Parameters
- ----------
- vocab_size
- embed_size
- units
- Feature dimension of inputs. Must be specified, as shape inference
- would fail if the first batch does not contain target indices of every
- cluster.
- cutoffs
- Ordered list of cutoff values for the clusters.
- div_val
- Division value to obtain embed_size per cluster. For cluster i:
- embed_size / div_val**i.
- use_bias
- Use a bias for the output layer.
- projection_initializer
- Initializer for the projection layers.
- embedding_initializer
- Initializer for the output layers and cluster weights. Called
- embedding_initializer, as the parameters may be tied to the embedding
- parameters of AdaptiveEmbedding.
- tie_embeddings
- Share embedding parameters with an AdaptiveEmbedding Block? If True, the
- params argument must be provided and set to the ParameterDict of the
- AdaptiveEmbedding Block.
- tie_projections
- Share projection parameters with an AdaptiveEmbedding Block? If True, the
- params argument must be provided and set to the ParameterDict of the
- AdaptiveEmbedding Block. tie_projections should be a list of boolean
- values, specifying for each cluster whether its projection weights are
- to be shared or not.
-
- """
-
- def __init__(self, vocab_size: int, embed_size: int, units: int, cutoffs: List[int],
- div_val: int = 1, use_bias: bool = True, project_same_dim: bool = True,
- projection_initializer=None, embedding_initializer=None,
- tie_embeddings: bool = False, tie_projections: Optional[List[bool]] = None,
- prefix: Optional[str] = None, params: Optional[mx.gluon.ParameterDict] = None):
- super().__init__(prefix=prefix, params=params)
- self._vocab_size = vocab_size
- self._embed_size = embed_size
- self._cutoffs = [0] + cutoffs + [vocab_size]
- self._div_val = div_val
- self._use_bias = use_bias
- self._units = units
- self._project_same_dim = project_same_dim
- self._embedding_initializer = embedding_initializer
- self._projection_initializer = projection_initializer
- self._tie_embeddings = tie_embeddings
- self._tie_projections = tie_projections
-
- # Sanity checks
- if cutoffs != sorted(cutoffs):
- raise ValueError('cutoffs must be a sorted list of cutoff values. '
- 'Got {}, but expected {}'.format(cutoffs, sorted(cutoffs)))
- if not cutoffs:
- raise ValueError('cutoffs must not be empty. Got {}'.format(cutoffs))
- if cutoffs[0] <= 0:
- raise ValueError('The first cutoff value ({}) must be greater than 0.'.format(cutoffs[0]))
- if cutoffs[-1] >= vocab_size:
- raise ValueError(
- 'The last cutoff value ({}) must be smaller than vocab_size ({}).'.format(
- cutoffs[-1], vocab_size))
-
- if tie_embeddings:
- assert params is not None
- if tie_projections is not None:
- assert params is not None
- if div_val == 1:
- if self._units == self._embed_size:
- assert len(tie_projections) == 0
- elif len(tie_projections) != 1:
- raise ValueError(
- 'tie_projections should be None or a boolean for every cluster. '
- 'As div_val == 1 there is only a single cluster. But got ({}).'.format(
- tie_projections))
- if len(tie_projections) != len(cutoffs) + 1:
- raise ValueError(
- 'tie_projections should be None or a boolean for every cluster. '
- 'It must thus have len(cutoffs) + 1. But got ({}) for cutoffs ({}).'.format(
- tie_projections, cutoffs))
-
- self._projections_name = '{}projection{}_weight'
- self._embeddings_name = '{}embedding{}_weight'
- with self.name_scope():
- if self._div_val == 1:
- if self._units != self._embed_size or project_same_dim:
- name = self._get_param_name('projection', 0)
- param = self.params.get(name, shape=(self._units, self._embed_size),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding', 0)
- param = self.params.get(name, shape=(self._vocab_size, self._embed_size),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding0_bias'
- param = self.params.get(name, shape=(self._vocab_size, ))
- setattr(self, name, param)
- else:
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- if self._units != self._embed_size // self._div_val**i or project_same_dim:
- name = self._get_param_name('projection', i)
- param = self.params.get(
- name, shape=(self._units, self._embed_size // self._div_val**i),
- init=self._projection_initializer)
- setattr(self, name, param)
-
- name = self._get_param_name('embedding', i)
- param = self.params.get(
- name, shape=(r_idx - l_idx, self._embed_size // self._div_val**i),
- init=self._embedding_initializer)
- setattr(self, name, param)
- if use_bias:
- name = 'outembedding{}_bias'.format(i)
- param = self.params.get(name, shape=(r_idx - l_idx, ))
- setattr(self, name, param)
-
- if self._div_val != 1:
- self.cluster = mx.gluon.nn.Dense(len(cutoffs), flatten=False,
- in_units=embed_size,
- weight_initializer=embedding_initializer)
-
- def _get_param_name(self, name, i):
- if name == 'projection':
- tied = self._tie_projections is not None and self._tie_projections[i]
- return self._projections_name.format('' if tied else 'out', i)
- elif name == 'embedding':
- return self._embeddings_name.format('' if self._tie_embeddings else 'out', i)
- else:
- raise ValueError('Invalid name')
-
- def hybrid_forward(self, F, hidden, target, **params): # pylint: disable=arguments-differ
- """Compute adaptive softmax.
-
- Parameters
- ----------
- hidden : Symbol or NDArray
- Inputs of shape [batch_size, sequence_length, units]
- target : Symbol or NDArray
- Targets of shape [batch_size, sequence_length]
-
- Returns
- -------
- out : Symbol or NDArray
- Negative log likelihood of targets with shape [batch_size,
- sequence_length]
- """
- if target is None: # TODO support None or add separate log_prob method
- raise NotImplementedError()
-
- # Work with flat data for simplicity
- target_flat = target.reshape((-1, ))
- hidden = F.reshape(hidden, shape=(-1, 0), reverse=True)
-
- # Helper arrays
- if F is mx.nd:
- range_bs_len = mx.nd.arange(target_flat.shape[0], dtype=target_flat.dtype,
- ctx=target_flat.context)
- else:
- # Shape inference fails when relying on F.stack(range_bs_len, ...)
- # below. Thus add zeros of intended shape here to simplify the
- # shape inference problem.
- range_bs_len = F.zeros_like(target_flat) + F.arange(start=0, stop=None,
- infer_range=True)
-
- if self._div_val == 1:
- if self._units != self._embed_size or self._project_same_dim:
- name = self._get_param_name('projection', 0)
- hidden = F.FullyConnected(data=hidden, weight=F.transpose(params[name]),
- no_bias=True, flatten=False, num_hidden=self._embed_size)
-
- name = self._get_param_name('embedding', 0)
- logits = F.FullyConnected(data=hidden, weight=params[name],
- bias=params['outembedding0_bias'] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False,
- num_hidden=self._vocab_size)
- logprob = F.log_softmax(logits)
- target_ = F.stack(range_bs_len, target_flat)
- out = F.gather_nd(logprob, indices=target_)
- else:
- # Prepare output
- if F is mx.nd:
- assert target.dtype == hidden.dtype
- out = F.zeros_like(target_flat)
-
- for i, (l_idx, r_idx) in enumerate(zip(self._cutoffs, self._cutoffs[1:])):
- if self._units != self._embed_size // self._div_val**i or self._project_same_dim:
- name = self._get_param_name('projection', i)
- proj_i = F.FullyConnected(data=hidden, weight=F.transpose(params[name]),
- no_bias=True, flatten=False,
- num_hidden=self._embed_size // self._div_val**i)
- else:
- proj_i = hidden
- # Shape [batch_size * sequence_length, r_idx - l_idx]
- name = self._get_param_name('embedding', i)
- logits_i = F.FullyConnected(
- data=proj_i, weight=params[name],
- bias=params['outembedding{}_bias'.format(i)] if self._use_bias else None,
- no_bias=not self._use_bias, flatten=False, num_hidden=r_idx - l_idx)
- if i == 0: # Shortlist
- logits_cluster = self.cluster(proj_i)
- logits_shortlist_cluster = F.concat(logits_i, logits_cluster, dim=1)
- logprob_shortlist_cluster = F.log_softmax(logits_shortlist_cluster)
-
- logprob_i = F.slice_axis(logprob_shortlist_cluster, axis=1, begin=0,
- end=-(len(self._cutoffs) - 2))
- logprob_cluster = F.slice_axis(logprob_shortlist_cluster, axis=1,
- begin=-(len(self._cutoffs) - 2), end=None)
- else: # Tail cluster
- logprob_i = F.broadcast_add(
- F.log_softmax(logits_i),
- F.gather_nd(logprob_cluster,
- F.stack(range_bs_len,
- F.ones_like(range_bs_len) * (i - 1))).expand_dims(1))
-
- # Targets limited to current cluster
- cond_i = F.broadcast_logical_and(target_flat >= l_idx, target_flat < r_idx)
- target_i = F.where(cond_i, target_flat - l_idx, F.zeros_like(target_flat))
- target_i = F.stack(range_bs_len, target_i)
-
- # Copy for targets that fall into the current cluster to out
- out_i = F.gather_nd(logprob_i, indices=target_i)
- out = F.where(cond_i, out_i, out)
-
- out = F.reshape_like(out, target)
-
- return -out
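For the tail clusters, the block above factorizes log P(w | h) = log P(cluster(w) | h) + log P(w | cluster(w), h) and shrinks the embedding width of each successive cluster by div_val. A short sketch of the resulting cluster layout under a hypothetical configuration (values chosen for illustration only):

vocab_size, embed_size, div_val = 50000, 512, 4
cutoffs = [2000, 10000]

bounds = [0] + cutoffs + [vocab_size]
for i, (l_idx, r_idx) in enumerate(zip(bounds, bounds[1:])):
    print('cluster {}: tokens [{}, {}), embed_size {}'.format(
        i, l_idx, r_idx, embed_size // div_val**i))
# cluster 0: tokens [0, 2000), embed_size 512
# cluster 1: tokens [2000, 10000), embed_size 128
# cluster 2: tokens [10000, 50000), embed_size 32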
diff --git a/scripts/language_model/transformer/transformer.py b/scripts/language_model/transformer/transformer.py
deleted file mode 100644
index 0c02df35bc..0000000000
--- a/scripts/language_model/transformer/transformer.py
+++ /dev/null
@@ -1,755 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Attention cells."""
-
-__all__ = ['TransformerXLCell', 'TransformerXL', 'XLNet']
-
-import typing
-
-import numpy as np
-import mxnet as mx
-from mxnet.gluon import nn
-
-import gluonnlp as nlp
-
-from .attention_cell import PositionalEmbeddingMultiHeadAttentionCell, \
- RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell
-from .embedding import AdaptiveEmbedding, ProjectedEmbedding
-from .softmax import AdaptiveLogSoftmaxWithLoss, ProjectedLogSoftmaxWithLoss
-
-
-class PositionalEmbedding(mx.gluon.HybridBlock):
- """Positional embedding.
-
- Parameters
- ----------
- embed_size : int
- Dimensionality of positional embeddings.
- """
-
- def __init__(self, embed_size, **kwargs):
- super().__init__(**kwargs)
-
- inv_freq = 1 / mx.nd.power(10000, mx.nd.arange(0.0, embed_size, 2.0) / embed_size)
- with self.name_scope():
- self.inv_freq = self.params.get_constant('inv_freq', inv_freq.reshape((1, -1)))
-
- def hybrid_forward(self, F, pos_seq, inv_freq): # pylint: disable=arguments-differ
- """Compute positional embeddings.
-
- Parameters
- ----------
- pos_seq : Symbol or NDArray
- Positions to compute embedding for. Shape (length, )
-
- Returns
- -------
- pos_emb: Symbol or NDArray
- Positional embeddings for positions specified in pos_seq. Shape
- (length, embed_size).
- """
- inp = F.dot(pos_seq.reshape((-1, 1)), inv_freq)
- pos_emb = F.concat(F.sin(inp), F.cos(inp), dim=-1)
- return pos_emb
-
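A numpy sketch of the sinusoidal embedding computed by this block, mirroring the inv_freq constant and the sin/cos concatenation above (illustration only):

import numpy as np

def positional_embedding(pos_seq, embed_size):
    inv_freq = 1.0 / np.power(10000, np.arange(0.0, embed_size, 2.0) / embed_size)
    inp = np.outer(pos_seq, inv_freq)                            # (length, embed_size // 2)
    return np.concatenate([np.sin(inp), np.cos(inp)], axis=-1)   # (length, embed_size)

print(positional_embedding(np.arange(5.0), 8).shape)  # (5, 8)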
-
-class TransformerXLCell(mx.gluon.HybridBlock):
- """Transformer-XL Cell.
-
- Parameters
- ----------
- attention_cell
- Attention cell to be used.
- units : int
- Number of units for the output
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- layer_norm_eps : float, default 1e-5
- Epsilon parameter passed to mxnet.gluon.nn.LayerNorm
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def __init__(self, attention_cell: PositionalEmbeddingMultiHeadAttentionCell, units=128,
- hidden_size=512, num_heads=4, activation='relu', scaled=True, dropout=0.0,
- layer_norm_eps=1e-5, output_attention=False, use_residual=True,
- weight_initializer=None, bias_initializer='zeros', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._units = units
- self._num_heads = num_heads
- self._activation = activation
- self._dropout = dropout
- self._use_residual = use_residual
- self._output_attention = output_attention
- self._scaled = scaled
- with self.name_scope():
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
- assert units % num_heads == 0
- self.attention_cell = attention_cell
- self.proj = nn.Dense(units=units, flatten=False, use_bias=False,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, prefix='proj_')
- self.ffn = nlp.model.PositionwiseFFN(hidden_size=hidden_size, units=units,
- use_residual=use_residual, dropout=dropout,
- ffn1_dropout=True, activation=activation,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer,
- layer_norm_eps=layer_norm_eps)
- self.layer_norm = nn.LayerNorm(in_channels=units, epsilon=layer_norm_eps)
-
- def hybrid_forward(self, F, inputs, pos_emb, mem_value, mask):
- # pylint: disable=arguments-differ
- """Transformer Decoder Attention Cell.
-
- Parameters
- ----------
- inputs : Symbol or NDArray
- Input sequence. Shape (batch_size, length, C_in)
- mem_value : Symbol or NDArray
- Memory value, i.e. cached states from the previous segment. Shape (batch_size, mem_length, C_in)
- pos_emb : Symbol or NDArray
- Positional embeddings. Shape (mem_length, C_in)
- mask : Symbol or NDArray or None
- Attention mask of shape (batch_size, length, length + mem_length)
-
- Returns
- -------
- decoder_cell_outputs: list
- Outputs of the decoder cell. Contains:
-
- - outputs of the transformer decoder cell. Shape (batch_size, length, C_out)
- - additional_outputs of all the transformer decoder cell
- """
- key_value = F.concat(mem_value, inputs, dim=1)
- outputs, attention_outputs = self.attention_cell(inputs, key_value, key_value, pos_emb,
- mask)
- outputs = self.proj(outputs)
- if self._dropout:
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- outputs = outputs + inputs
- outputs = self.layer_norm(outputs)
- outputs = self.ffn(outputs)
- additional_outputs = [attention_outputs] if self._output_attention else []
- return outputs, additional_outputs
-
-
-class _BaseTransformerXL(mx.gluon.HybridBlock):
- def __init__(self, vocab_size, embed_size, embed_cutoffs=None, embed_div_val=None, num_layers=2,
- units=128, hidden_size=2048, num_heads=4, scaled=True, dropout=0.0,
- attention_dropout=0.0, use_residual=True, clamp_len: typing.Optional[int] = None,
- project_same_dim: bool = True, tie_input_output_embeddings: bool = False,
- tie_input_output_projections: typing.Optional[typing.List[bool]] = None,
- output_attention=False, weight_initializer=None, bias_initializer='zeros',
- prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
- 'exactly by the number of heads. Received units={}, ' \
- 'num_heads={}'.format(units, num_heads)
-
- self._num_layers = num_layers
- self._units = units
- self._embed_size = embed_size
- self._hidden_size = hidden_size
- self._num_heads = num_heads
- self._dropout = dropout
- self._use_residual = use_residual
- self._clamp_len = clamp_len
- self._project_same_dim = project_same_dim
- self._tie_input_output_embeddings = tie_input_output_embeddings
- self._tie_input_output_projections = tie_input_output_projections
- if output_attention:
- # Will be implemented when splitting this Block to separate the
- # AdaptiveLogSoftmaxWithLoss used with targets
- raise NotImplementedError()
- self._output_attention = output_attention
- with self.name_scope():
- if embed_cutoffs is not None and embed_div_val != 1:
- self.embedding = AdaptiveEmbedding(vocab_size=vocab_size, embed_size=embed_size,
- units=units, cutoffs=embed_cutoffs,
- div_val=embed_div_val,
- project_same_dim=project_same_dim)
- self.crit = AdaptiveLogSoftmaxWithLoss(vocab_size=vocab_size, embed_size=embed_size,
- units=units, cutoffs=embed_cutoffs,
- div_val=embed_div_val,
- project_same_dim=project_same_dim,
- tie_embeddings=tie_input_output_embeddings,
- tie_projections=tie_input_output_projections,
- params=self.embedding.collect_params())
- else:
- self.embedding = ProjectedEmbedding(vocab_size=vocab_size, embed_size=embed_size,
- units=units, project_same_dim=project_same_dim)
- self.crit = ProjectedLogSoftmaxWithLoss(
- vocab_size=vocab_size, embed_size=embed_size, units=units,
- project_same_dim=project_same_dim, tie_embeddings=tie_input_output_embeddings,
- tie_projections=tie_input_output_projections[0]
- if tie_input_output_projections is not None else None,
- params=self.embedding.collect_params())
-
- self.pos_emb = PositionalEmbedding(embed_size)
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
-
- self.transformer_cells = nn.HybridSequential()
- for i in range(num_layers):
- attention_cell = PositionalEmbeddingMultiHeadAttentionCell(
- d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
- dropout=attention_dropout)
- self.transformer_cells.add(
- TransformerXLCell(attention_cell=attention_cell, units=units,
- hidden_size=hidden_size, num_heads=num_heads,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, dropout=dropout,
- scaled=scaled, use_residual=use_residual,
- output_attention=output_attention,
- prefix='transformer%d_' % i))
-
- def hybrid_forward(self, F, step_input, target, mask, pos_seq, mems): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, length]
- target : NDArray or Symbol
- Targets of shape [batch_size, length]
- mask : NDArray or Symbol
- Attention mask of shape [length + memory_length]
- pos_seq : NDArray or Symbol
- Array of [length + memory_length] created with arange(length +
- memory_length).
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
-
- Returns
- -------
- softmax_output : NDArray or Symbol
- Negative log likelihood of targets with shape [batch_size, length]
- hids : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer (does not include last_hidden).
- last_hidden
-
- """
- core_out = self.embedding(step_input)
- if self._clamp_len is not None and self._clamp_len >= 0:
- pos_seq = F.clip(pos_seq, a_min=0, a_max=self._clamp_len)
- pos_emb = self.pos_emb(pos_seq)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
- pos_emb = self.dropout_layer(pos_emb)
-
- hids = []
- for i, layer in enumerate(self.transformer_cells):
- hids.append(core_out)
- mems_i = None if mems is None else mems[i]
- # inputs, pos_emb, mem_value, mask
- core_out, _ = layer(core_out, pos_emb, mems_i, mask)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
-
- softmax_output = self.crit(core_out, target)
-
- return softmax_output, hids, core_out
-
-
-class TransformerXL(mx.gluon.Block):
- """Structure of the Transformer-XL.
-
- Dai, Z., Yang, Z., Yang, Y., Cohen, W. W., Carbonell, J., Le, Q. V., &
- Salakhutdinov, R. (2019). Transformer-XL: Attentive language models beyond
- a fixed-length context. arXiv preprint arXiv:1901.02860.
-
- Parameters
- ----------
- attention_cell : None
- Argument reserved for later.
- vocab_size : int or None, default None
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- tie_input_output_embeddings : boolean, default False
- If True, tie embedding parameters for all clusters between
- AdaptiveEmbedding and AdaptiveLogSoftmaxWithLoss.
- tie_input_output_projections : List[boolean] or None, default None
- If not None, tie projection parameters for the specified clusters
- between AdaptiveEmbedding and AdaptiveLogSoftmaxWithLoss.
- `len(tie_input_output_projections)` must equal the number of clusters,
- i.e. `len(cutoffs) + 1`.
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
-
- """
-
- def __init__(self, *args, **kwargs):
- prefix = kwargs.pop('prefix', None)
- params = kwargs.pop('params', None)
- super().__init__(prefix=prefix, params=params)
-
- with self.name_scope():
- self._net = _BaseTransformerXL(*args, **kwargs)
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._net._units), ctx=context)
- for _ in range(len(self._net.transformer_cells))
- ]
- return mems
-
- def forward(self, step_input, target, mems): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, length]
- target : NDArray or Symbol
- Input of shape [batch_size, length]
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- mem_len, units].
-
- Returns
- -------
- softmax_output : NDArray or Symbol
- Negative log likelihood of targets with shape [batch_size, length]
- mems : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer.
-
- """
- # Uses same number of unmasked memory steps for every step
- batch_size, qlen = step_input.shape[:2]
- mlen = mems[0].shape[1] if mems is not None else 0
- klen = qlen + mlen
-
- all_ones = np.ones((qlen, klen), dtype=step_input.dtype)
- mask = np.triu(all_ones, 1 + mlen) + np.tril(all_ones, 0)
- mask_nd = (mx.nd.from_numpy(mask, zero_copy=True) == 0).as_in_context(
- step_input.context).expand_dims(0).broadcast_axes(axis=0, size=batch_size)
-
- pos_seq = mx.nd.arange(start=klen, stop=-qlen, step=-1, ctx=step_input.context)
-
- softmax_output, hids, last_hidden = self._net(step_input, target, mask_nd, pos_seq, mems)
-
- # Update memory
- if mems is not None:
- new_mems = [
- # pylint: disable=invalid-sequence-index
- mx.nd.concat(mem_i, hid_i, dim=1)[:, -mem_i.shape[1]:].detach()
- for mem_i, hid_i in zip(mems, hids)
- ]
- else:
- new_mems = None
-
- return softmax_output, new_mems, last_hidden
-
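A worked numpy example of the mask built in forward() above for qlen=3, mlen=2 (illustration only): every row of the resulting mask has exactly mlen unmasked keys ending at the query's own position, which is what the 'same number of unmasked memory steps for every step' comment refers to.

import numpy as np

qlen, mlen = 3, 2
klen = qlen + mlen
all_ones = np.ones((qlen, klen))
mask = np.triu(all_ones, 1 + mlen) + np.tril(all_ones, 0)
print((mask == 0).astype(int))  # 1 = attended, 0 = masked
# [[0 1 1 0 0]
#  [0 0 1 1 0]
#  [0 0 0 1 1]]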
-
-class XLNetCell(TransformerXLCell):
- """XLNet Cell.
-
- Parameters
- ----------
- attention_cell
- Attention cell to be used.
- units : int
- Number of units for the output
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- use_residual : bool
- output_attention: bool
- Whether to output the attention weights
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def hybrid_forward(self, F, inputs, pos_emb, mem_value, mask, segments):
- # pylint: disable=arguments-differ
- """Transformer Decoder Attention Cell.
-
- Parameters
- ----------
- inputs : Symbol or NDArray
- Input sequence. Shape (batch_size, length, C_in)
- mem_value : Symbol or NDArray
- Memory value, i.e. cached states from the previous segment. Shape (batch_size,
- memory_length, C_in)
- pos_emb : Symbol or NDArray
- Positional embeddings. Shape (mem_length, C_in)
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, query_length +
- memory_length, 2]. `1` indicates that the pair is not in the same
- segment.
-
- Returns
- -------
- decoder_cell_outputs: list
- Outputs of the decoder cell. Contains:
-
- - outputs of the transformer decoder cell. Shape (batch_size, length, C_out)
- - additional_outputs of all the transformer decoder cell
- """
- key_value = inputs
- if mem_value is not None:
- key_value = F.concat(mem_value, inputs, dim=1)
- outputs, attention_outputs = self.attention_cell(inputs, key_value, key_value, pos_emb,
- mask, segments)
-
- outputs = self.proj(outputs)
- if self._dropout:
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- outputs = outputs + inputs
- outputs = self.layer_norm(outputs)
- outputs = self.ffn(outputs)
- additional_outputs = [attention_outputs] if self._output_attention else []
- return outputs, additional_outputs
-
-
-class _BaseXLNet(mx.gluon.HybridBlock):
- """
- Parameters
- ----------
- vocab_size : int
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- activation
- Activation function used for the position-wise feed-forward networks
- two_stream
- If True, use Two-Stream Self-Attention. Typically set to True for
- pre-training and False during finetuning.
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- attention_dropout : float
- use_residual : bool
- clamp_len : int
- Clamp all relative distances larger than clamp_len
- use_decoder : bool, default True
- Whether to include the decoder for language model prediction.
- tie_decoder_weight : bool, default True
- Whether to tie the decoder weight with the input embeddings
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s (and name of weight if params is `None`).
- params : ParameterDict or None
- Container for weight sharing between cells. Created if `None`.
-
- """
-
- def __init__(self, vocab_size, num_layers=2, units=128, hidden_size=2048, num_heads=4,
- activation='approx_gelu', two_stream: bool = False, scaled=True, dropout=0.0,
- attention_dropout=0.0, use_residual=True, clamp_len: typing.Optional[int] = None,
- use_decoder=True, tie_decoder_weight=True, weight_initializer=None,
- bias_initializer='zeros', prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- assert units % num_heads == 0, 'In TransformerDecoder, the units should be divided ' \
- 'exactly by the number of heads. Received units={}, ' \
- 'num_heads={}'.format(units, num_heads)
-
- self._num_layers = num_layers
- self._units = units
- self._hidden_size = hidden_size
- self._num_heads = num_heads
- self._two_stream = two_stream
- assert not two_stream, 'Not yet implemented.'
- self._dropout = dropout
- self._use_residual = use_residual
- self._clamp_len = clamp_len
- with self.name_scope():
- self.word_embed = nn.Embedding(vocab_size, units)
- self.mask_embed = self.params.get('mask_embed', shape=(1, 1, units))
- self.pos_embed = PositionalEmbedding(units)
- if dropout:
- self.dropout_layer = nn.Dropout(rate=dropout)
-
- self.transformer_cells = nn.HybridSequential()
- for i in range(num_layers):
- attention_cell = RelativeSegmentEmbeddingPositionalEmbeddingMultiHeadAttentionCell(
- d_head=units // num_heads, num_heads=num_heads, scaled=scaled,
- dropout=attention_dropout)
- self.transformer_cells.add(
- XLNetCell(attention_cell=attention_cell, units=units, hidden_size=hidden_size,
- num_heads=num_heads, activation=activation, layer_norm_eps=1e-12,
- weight_initializer=weight_initializer,
- bias_initializer=bias_initializer, dropout=dropout, scaled=scaled,
- use_residual=use_residual, prefix='transformer%d_' % i))
- if use_decoder:
- self.decoder = nn.Dense(
- vocab_size, flatten=False,
- params=self.word_embed.params if tie_decoder_weight else None)
-
- def hybrid_forward(self, F, step_input, segments, mask, pos_seq, mems, mask_embed):
- # pylint: disable=arguments-differ
- """
- Parameters
- ----------
- step_input : Symbol or NDArray
- Input of shape [batch_size, query_length]
- segments : Symbol or NDArray
- One-hot vector indicating if a query-key pair is in the same
- segment or not. Shape [batch_size, query_length, query_length +
- memory_length, 2]. `1` indicates that the pair is not in the same
- segment.
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
- pos_seq : Symbol or NDArray
- Relative distances
- mems : List of NDArray or Symbol, optional
- Memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
-
- Returns
- -------
- core_out : NDArray or Symbol
- For use_decoder=True, logits. Otherwise output of last layer.
- hids : List of NDArray or Symbol
- Stacking the output of each layer
- """
- if self._clamp_len:
- pos_seq = F.clip(pos_seq, a_min=0, a_max=self._clamp_len)
-
- # Use mask_embed in a no-op so the otherwise-unused parameter keeps HybridBlock happy
- core_out = F.broadcast_add(self.word_embed(step_input), 0 * mask_embed)
- pos_emb = self.pos_embed(pos_seq)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
- pos_emb = self.dropout_layer(pos_emb)
-
- hids = []
- for i, layer in enumerate(self.transformer_cells):
- hids.append(core_out)
- mems_i = None if mems is None else mems[i]
- core_out, _ = layer(core_out, pos_emb, mems_i, mask, segments)
-
- if self._dropout:
- core_out = self.dropout_layer(core_out)
-
- if hasattr(self, 'decoder'):
- return self.decoder(core_out), hids
- return core_out, hids
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._units), ctx=context)
- for _ in range(len(self.transformer_cells))
- ]
- return mems
-
-
-class XLNet(mx.gluon.Block):
- """XLNet
-
- Yang, Z., Dai, Z., Yang, Y., Carbonell, J., Salakhutdinov, R., & Le, Q. V.
- (2019). XLNet: Generalized Autoregressive Pretraining for Language
- Understanding. arXiv preprint arXiv:1906.08237.
-
- Parameters
- ----------
- attention_cell : None
- Argument reserved for later.
- vocab_size : int or None, default None
- The size of the vocabulary.
- num_layers : int
- units : int
- hidden_size : int
- number of units in the hidden layer of position-wise feed-forward networks
- num_heads : int
- Number of heads in multi-head attention
- activation
- Activation function used for the position-wise feed-forward networks
- two_stream
- If True, use Two-Stream Self-Attention. Typically set to True for
- pre-training and False during finetuning.
- scaled : bool
- Whether to scale the softmax input by the sqrt of the input dimension
- in multi-head attention
- dropout : float
- use_residual : bool
- use_decoder : bool, default True
- Whether to include the decoder for language model prediction.
- tie_decoder_weight : bool, default True
- Whether to tie the decoder weight with the input embeddings
- weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default None
- Prefix for name of `Block`s (and name of weight if params is `None`).
- params : ParameterDict or None
- Container for weight sharing between cells. Created if `None`.
-
- """
-
- def __init__(self, *args, **kwargs):
- prefix = kwargs.pop('prefix', None)
- params = kwargs.pop('params', None)
- super().__init__(prefix=prefix, params=params)
-
- with self.name_scope():
- self._net = _BaseXLNet(*args, **kwargs)
-
- def begin_mems(self, batch_size, mem_len, context):
- mems = [
- mx.nd.zeros((batch_size, mem_len, self._net._units), ctx=context)
- for _ in range(len(self._net.transformer_cells))
- ]
- return mems
-
- def forward(self, step_input, token_types, mems=None, mask=None): # pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- Input of shape [batch_size, query_length]
- token_types : NDArray or Symbol
- Token types of the input tokens of shape [batch_size,
- query_length], indicating various portions of the inputs.
- mems : List of NDArray or Symbol, optional
- Optional memory from previous forward passes containing
- `num_layers` `NDArray`s or `Symbol`s each of shape [batch_size,
- memory_length, units].
- mask : Symbol or NDArray
- Attention mask of shape (batch_size, length, length + mem_length)
-
- Returns
- -------
- output : NDArray or Symbol
- For XLNet(..., use_decoder=True), logits. Otherwise output of last
- XLNetCell layer.
- mems : List of NDArray or Symbol
- List containing `num_layers` `NDArray`s or `Symbol`s each of shape
- [batch_size, mem_len, units] representing the memory states at
- the entry of each layer.
-
- """
- # Uses same number of unmasked memory steps for every step
- batch_size, qlen = step_input.shape[:2]
- mlen = mems[0].shape[1] if mems is not None else 0
- klen = qlen + mlen
- segments = None
- if token_types is not None:
- if mlen > 0:
- mem_pad = mx.nd.zeros([batch_size, mlen], dtype=token_types.dtype,
- ctx=token_types.context)
- mem_pad_token_types = mx.nd.concat(mem_pad, token_types, dim=1)
- else:
- mem_pad_token_types = token_types
- # `1` indicates not in the same segment; shape [batch_size, qlen, klen]
- segments = mx.nd.broadcast_not_equal(token_types.expand_dims(2),
- mem_pad_token_types.expand_dims(1))
- segments = mx.nd.one_hot(segments, 2, 1, 0)
-
-
- pos_seq = mx.nd.arange(start=klen, stop=-qlen, step=-1, ctx=step_input.context)
-
- if mask is None and self._net._active:
- # Hybridized _net does not support `None`-valued parameters
- mask = mx.nd.ones((batch_size, qlen, klen), ctx=step_input.context)
- output, hids = self._net(step_input, segments, mask, pos_seq, mems)
-
- # Update memory
- new_mems = None
- if mems is not None:
- new_mems = [
- # pylint: disable=invalid-sequence-index
- mx.nd.concat(mem_i, hid_i, dim=1)[:, -mem_i.shape[1]:].detach()
- for mem_i, hid_i in zip(mems, hids)
- ]
-
- return output, new_mems
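The relative segment encoding computed in XLNet.forward above can be reproduced with plain numpy; `1` marks query/key pairs whose token types differ, and the one-hot expansion adds the trailing dimension of size 2 expected by the attention cell (illustration only, without memory for simplicity):

import numpy as np

token_types = np.array([[0, 0, 1, 1]])   # (batch=1, qlen=4), mlen=0
diff_segment = (token_types[:, :, None] != token_types[:, None, :]).astype(int)
print(diff_segment[0])
# [[0 0 1 1]
#  [0 0 1 1]
#  [1 1 0 0]
#  [1 1 0 0]]
segments = np.eye(2)[diff_segment]       # one-hot, shape (1, 4, 4, 2)
print(segments.shape)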
diff --git a/scripts/language_model/transformer_xl.py b/scripts/language_model/transformer_xl.py
deleted file mode 100644
index 2592aadb85..0000000000
--- a/scripts/language_model/transformer_xl.py
+++ /dev/null
@@ -1,164 +0,0 @@
-"""Transformer-XL Language Model
-================================
-
-This example shows how to build a Transformer-XL language model with Gluon NLP
-Toolkit.
-
-@article{dai2019transformer,
- title = {Transformer-XL: Attentive language models beyond a fixed-length context},
- author = {Dai, Zihang and Yang, Zhilin and Yang, Yiming and Cohen, William W
- and Carbonell, Jaime and Le, Quoc V and Salakhutdinov, Ruslan},
- journal = {arXiv preprint arXiv:1901.02860},
- year = {2019},
-}
-
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import itertools
-import math
-import sys
-import time
-
-import mxnet as mx
-import gluonnlp as nlp
-
-nlp.utils.check_version('0.8.0')
-
-def evaluate(data_iter):
- """Evaluate the model on the dataset."""
-
- total_L = mx.nd.zeros(shape=(1, ))
- ntotal = 0
-
- mems = model.begin_mems(args.eval_batch_size, args.mem_len, context=ctx)
- for i, (data, target) in enumerate(data_iter):
- data = data.T.as_in_context(ctx)
- target = target.T.as_in_context(ctx)
- L, mems, _ = model(data, target, mems) # Negative log likelihood of targets
- total_L += mx.nd.sum(L).as_in_context(mx.cpu())
- ntotal += L.size
- mx.nd.waitall() # Avoid OOM due to pushing data too fast
-
- if i % args.log_every == 0:
- current_loss = total_L.asscalar() / ntotal
- print('Iter {} evaluation loss {:.2f}, ppl {:.2f}, bpc {:.2f}'.format(
- i, current_loss, math.exp(current_loss), current_loss / math.log(2)))
-
- return total_L.asscalar() / ntotal
-
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(description='Transformer-XL Language Modeling.',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
- parser.add_argument('--dataset', type=str, required=True,
- choices=['wt103', 'text8', 'enwik8', 'lm1b'], help='Dataset name.')
- parser.add_argument('--split', type=str, default='test', choices=['valid', 'test'],
- help='Which split to evaluate')
- parser.add_argument('--parameter-file', type=str, default=None, required=True,
- help='File storing pre-trained parameters for the model.')
- parser.add_argument('--vocab-file', type=str, default=None, required=True,
- help='File storing nlp.Vocab corresponding to --parameter-file.')
-
- parser.add_argument('--mem-len', type=int, default=1600,
- help='length of the retained previous hidden states (memory length)')
- parser.add_argument('--bptt', type=int, default=128,
- help='The number of tokens per batch dimension per sample.')
- parser.add_argument('--clamp-len', type=int, default=1000,
- help='max positional embedding index')
-
- parser.add_argument('--log-every', type=int, default=10,
- help='Log every `--log-every` iterations.')
-
- # TODO: training not yet supported
- parser.add_argument('--eval-only', action='store_true', required=True,
- help='Only evaluate the trained model')
- parser.add_argument('--eval-batch-size', type=int, default=64,
- help='Batch size for evaluation.')
- parser.add_argument('--gpu', type=int, help='GPU id')
- args = parser.parse_args()
-
- start_time = time.time()
-
- # Model
- from transformer.model import get_model
- with open(args.vocab_file, 'r') as f:
- vocab = nlp.Vocab.from_json(f.read())
-
- ctx = mx.gpu(args.gpu) if args.gpu is not None else mx.cpu()
- model, vocab = get_model('transformerxl', vocab=vocab, dataset_name=args.dataset,
- clamp_len=args.clamp_len)
- model.initialize(ctx=ctx)
- model.load_parameters(args.parameter_file, ignore_extra=False)
- model.hybridize()
- print(model)
-
- # Data
- if args.dataset == 'wt103':
- val_dataset, test_dataset = [
- nlp.data.WikiText103(segment=segment, skip_empty=False, bos=vocab.bos_token,
- eos=vocab.eos_token) for segment in ['val', 'test']
- ]
- elif args.dataset == 'lm1b':
- # bos=vocab.eos_token is not a typo: tf uses [''] + symbols + ['']
- test_datasets = list(
- nlp.data.GBWStream(segment='test', skip_empty=True, bos=vocab.eos_token,
- eos=vocab.eos_token))
- assert len(test_datasets) == 1
- test_dataset = mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(test_datasets[0])))
- val_dataset = None
- elif args.dataset == 'text8':
- dataset = nlp.data.Text8(max_sentence_length=None)
- chars = list(itertools.chain.from_iterable(list(w) + ['_'] for w in dataset[0]))
- num_test_chars = 5000000
- val_dataset = mx.gluon.data.SimpleDataset(chars[-2 * num_test_chars:-num_test_chars])
- test_dataset = mx.gluon.data.SimpleDataset(chars[-num_test_chars:])
- elif args.dataset == 'enwik8':
- val_dataset, test_dataset = [
- mx.gluon.data.SimpleDataset(
- list(itertools.chain.from_iterable(nlp.data.Enwik8(segment=segment))))
- for segment in ['val', 'test']
- ]
- else:
- print('Dataset unsupported by this script.')
- sys.exit(1)
-
- eval_batchify = nlp.data.batchify.CorpusBPTTBatchify(vocab, args.bptt, args.eval_batch_size,
- last_batch='discard')
-
- # Evaluate
- test_loss = None
- valid_loss = None
- if args.split in ('valid', 'all') and val_dataset is not None:
- val_data = eval_batchify(val_dataset)
- valid_loss = evaluate(val_data)
- if args.split in ('test', 'all') and test_dataset is not None:
- test_data = eval_batchify(test_dataset)
- test_loss = evaluate(test_data)
-
- if test_loss is not None:
- print('Best test loss {:.2f}, test ppl {:.2f}, test bpc {:.2f}'.format(
- test_loss, math.exp(test_loss), test_loss / math.log(2)))
- if valid_loss is not None:
- print('Best validation loss {:.2f}, val ppl {:.2f}, val bpc {:.2f}'.format(
- valid_loss, math.exp(valid_loss), valid_loss / math.log(2)))
-
- print('Total time cost {:.2f}s'.format(time.time() - start_time))
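The metrics printed by this script follow directly from the average negative log-likelihood in nats: perplexity is exp(loss) and bits-per-character is loss / ln(2). A quick numeric check with a hypothetical loss value:

import math

loss = 3.40  # hypothetical average negative log-likelihood (nats per token)
print('ppl {:.2f}, bpc {:.2f}'.format(math.exp(loss), loss / math.log(2)))
# ppl 29.96, bpc 4.91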
diff --git a/scripts/language_model/word_language_model.py b/scripts/language_model/word_language_model.py
deleted file mode 100644
index 12df344d79..0000000000
--- a/scripts/language_model/word_language_model.py
+++ /dev/null
@@ -1,474 +0,0 @@
-"""
-Word Language Model
-===================
-
-This example shows how to build a word-level language model on WikiText-2 with Gluon NLP Toolkit.
-By using the existing data pipeline tools and building blocks, the process is greatly simplified.
-
-We implement the AWD LSTM language model proposed in the following work.
-
-@article{merityRegOpt,
- title={{Regularizing and Optimizing LSTM Language Models}},
- author={Merity, Stephen and Keskar, Nitish Shirish and Socher, Richard},
- journal={ICLR},
- year={2018}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import math
-import os
-import sys
-import mxnet as mx
-from mxnet import gluon, autograd
-import gluonnlp as nlp
-
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.append(os.path.join(curr_path, '..', '..'))
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description=
- 'MXNet Autograd RNN/LSTM Language Model on Wikitext-2.')
-parser.add_argument('--model', type=str, default='lstm',
- help='type of recurrent net (rnn_tanh, rnn_relu, lstm, gru)')
-parser.add_argument('--emsize', type=int, default=400,
- help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=1150,
- help='number of hidden units per layer')
-parser.add_argument('--nlayers', type=int, default=3,
- help='number of layers')
-parser.add_argument('--lr', type=float, default=30,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=0.25,
- help='gradient clipping')
-parser.add_argument('--epochs', type=int, default=750,
- help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=80, metavar='N',
- help='batch size')
-parser.add_argument('--bptt', type=int, default=70,
- help='sequence length')
-parser.add_argument('--dropout', type=float, default=0.4,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--dropout_h', type=float, default=0.2,
- help='dropout applied to hidden layer (0 = no dropout)')
-parser.add_argument('--dropout_i', type=float, default=0.65,
- help='dropout applied to input layer (0 = no dropout)')
-parser.add_argument('--dropout_e', type=float, default=0.1,
- help='dropout applied to embedding layer (0 = no dropout)')
-parser.add_argument('--weight_dropout', type=float, default=0.5,
- help='weight dropout applied to h2h weight matrix (0 = no weight dropout)')
-parser.add_argument('--tied', action='store_true',
- help='tie the word embedding and softmax weights')
-parser.add_argument('--log-interval', type=int, default=200, metavar='N',
- help='report interval')
-parser.add_argument('--save', type=str, default='model.params',
- help='path to save the final model')
-parser.add_argument('--eval_only', action='store_true',
- help='Whether to only evaluate the trained model')
-parser.add_argument('--gpu', type=str, help='single gpu id')
-parser.add_argument('--optimizer', type=str, default='sgd',
- help='optimizer to use (sgd, adam)')
-parser.add_argument('--wd', type=float, default=1.2e-6,
- help='weight decay applied to all weights')
-parser.add_argument('--alpha', type=float, default=2,
- help='alpha L2 regularization on RNN activation '
- '(alpha = 0 means no regularization)')
-parser.add_argument('--beta', type=float, default=1,
- help='beta slowness regularization applied on RNN activation '
- '(beta = 0 means no regularization)')
-parser.add_argument('--ntasgd', action='store_true',
- help='Whether to apply ntasgd')
-parser.add_argument('--test_mode', action='store_true',
- help='Whether to run through the script with few examples')
-parser.add_argument('--lr_update_interval', type=int, default=30,
- help='lr update interval')
-parser.add_argument('--lr_update_factor', type=float, default=0.1,
- help='lr update factor')
-args = parser.parse_args()
-
-###############################################################################
-# Load data
-###############################################################################
-
-context = [mx.cpu()] if not args.gpu else [mx.gpu(int(args.gpu))]
-
-assert args.batch_size % len(context) == 0, \
- 'Total batch size must be multiple of the number of devices'
-
-assert args.weight_dropout > 0 or (args.weight_dropout == 0 and args.alpha == 0), \
- 'The alpha L2 regularization cannot be used with standard RNN, please set alpha to 0'
-
-train_dataset, val_dataset, test_dataset = \
- [nlp.data.WikiText2(segment=segment,
- skip_empty=False, bos=None, eos='<eos>')
- for segment in ['train', 'val', 'test']]
-
-vocab = nlp.Vocab(counter=nlp.data.Counter(train_dataset), padding_token=None, bos_token=None)
-train_batchify = nlp.data.batchify.CorpusBatchify(vocab, args.batch_size)
-train_data = train_batchify(train_dataset)
-val_batch_size = 10
-val_batchify = nlp.data.batchify.CorpusBatchify(vocab, val_batch_size)
-val_data = val_batchify(val_dataset)
-test_batch_size = 1
-test_batchify = nlp.data.batchify.CorpusBatchify(vocab, test_batch_size)
-test_data = test_batchify(test_dataset)
-
-if args.test_mode:
- args.emsize = 200
- args.nhid = 200
- args.nlayers = 1
- args.epochs = 3
- train_data = train_data[0:100]
- val_data = val_data[0:100]
- test_data = test_data[0:100]
-
-print(args)
-
-###############################################################################
-# Build the model
-###############################################################################
-
-ntokens = len(vocab)
-
-if args.weight_dropout > 0:
- print('Use AWDRNN')
- model_eval = nlp.model.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers,
- args.tied, args.dropout, args.weight_dropout,
- args.dropout_h, args.dropout_i, args.dropout_e)
- model = nlp.model.train.AWDRNN(args.model, len(vocab), args.emsize, args.nhid, args.nlayers,
- args.tied, args.dropout, args.weight_dropout,
- args.dropout_h, args.dropout_i, args.dropout_e)
-else:
- model_eval = nlp.model.StandardRNN(args.model, len(vocab), args.emsize,
- args.nhid, args.nlayers, args.dropout, args.tied)
- model = nlp.model.train.StandardRNN(args.model, len(vocab), args.emsize,
- args.nhid, args.nlayers, args.dropout, args.tied)
-
-model.initialize(mx.init.Xavier(), ctx=context)
-
-model.hybridize(static_alloc=True)
-
-print(model)
-
-
-if args.optimizer == 'sgd':
- trainer_params = {'learning_rate': args.lr,
- 'momentum': 0,
- 'wd': args.wd}
-elif args.optimizer == 'adam':
- trainer_params = {'learning_rate': args.lr,
- 'wd': args.wd,
- 'beta1': 0,
- 'beta2': 0.999,
- 'epsilon': 1e-9}
-
-trainer = gluon.Trainer(model.collect_params(), args.optimizer, trainer_params,
- update_on_kvstore=False)
-
-loss = gluon.loss.SoftmaxCrossEntropyLoss()
-
-
-class JointActivationRegularizationLoss(gluon.loss.Loss):
- r"""Computes Joint Regularization Loss with standard loss.
-
- The activation regularization refers to
- gluonnlp.loss.ActivationRegularizationLoss.
-
- The temporal activation regularization refers to
- gluonnlp.loss.TemporalActivationRegularizationLoss.
-
- Parameters
- ----------
- loss : gluon.loss.Loss
- The standard loss
- alpha: float
- The activation regularization parameter in gluonnlp.loss.ActivationRegularizationLoss
- beta: float
- The temporal activation regularization parameter in
- gluonnlp.loss.TemporalActivationRegularizationLoss
-
- Inputs:
- - **out**: NDArray
- output tensor with shape `(sequence_length, batch_size, input_size)`
- when `layout` is "TNC".
- - **target**: NDArray
- target tensor with shape `(sequence_length, batch_size)`
- when `layout` is "TNC".
- - **states**: the stack outputs from RNN,
- which consists of output from each time step (TNC).
- - **dropped_states**: the stack outputs from RNN with dropout,
- which consists of output from each time step (TNC).
-
- Outputs:
- - **loss**: loss tensor with shape (batch_size,). Dimensions other than
- batch_axis are averaged out.
- """
-
- def __init__(self, l, alpha, beta, weight=None, batch_axis=None, **kwargs):
- super(JointActivationRegularizationLoss, self).__init__(weight, batch_axis, **kwargs)
- self._loss = l
- self._alpha, self._beta = alpha, beta
- if alpha:
- self._ar_loss = nlp.loss.ActivationRegularizationLoss(alpha)
- if beta:
- self._tar_loss = nlp.loss.TemporalActivationRegularizationLoss(beta)
-
- def __repr__(self):
- s = 'JointActivationTemporalActivationRegularizationLoss'
- return s
-
- def hybrid_forward(self, F, out, target, states, dropped_states): # pylint: disable=arguments-differ
- # pylint: disable=unused-argument
- l = self._loss(out.reshape(-3, -1), target.reshape(-1,))
- if self._alpha:
- l = l + self._ar_loss(*dropped_states)
- if self._beta:
- l = l + self._tar_loss(*states)
- return l
-
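The two regularizers combined by this class follow the AWD-LSTM recipe: activation regularization (AR) penalizes large activations of the dropout-masked RNN outputs, while temporal activation regularization (TAR) penalizes large changes between consecutive hidden states. A rough numpy sketch of the two terms (illustration only; shapes are arbitrary and the exact reduction used by gluonnlp.loss may differ):

import numpy as np

alpha, beta = 2.0, 1.0
states = np.random.randn(7, 4, 16)                         # (T, N, C) RNN outputs
dropped = states * (np.random.rand(*states.shape) > 0.4)   # dropout-masked outputs

ar = alpha * np.mean(dropped ** 2)                         # activation regularization
tar = beta * np.mean((states[1:] - states[:-1]) ** 2)      # temporal activation regularization
print(ar, tar)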
-
-joint_loss = JointActivationRegularizationLoss(loss, args.alpha, args.beta)
-
-###############################################################################
-# Training code
-###############################################################################
-
-
-def detach(hidden):
- """Transfer hidden states into new states, to detach them from the history.
- Parameters
- ----------
- hidden : NDArray
- The hidden states
- Returns
- ----------
- hidden: NDArray
- The detached hidden states
- """
- if isinstance(hidden, (tuple, list)):
- hidden = [detach(h) for h in hidden]
- else:
- hidden = hidden.detach()
- return hidden
-
-
-def get_batch(data_source, i, seq_len=None):
- """Get mini-batches of the dataset.
-
- Parameters
- ----------
- data_source : NDArray
- The dataset the model is evaluated on.
- i : int
- The index of the batch, starting from 0.
- seq_len : int
- The length of each sample in the batch.
-
- Returns
- -------
- data: NDArray
- The context
- target: NDArray
- The words to predict
- """
- seq_len = min(seq_len if seq_len else args.bptt, len(data_source) - 1 - i)
- data = data_source[i:i+seq_len]
- target = data_source[i+1:i+1+seq_len]
- return data, target
-
-
-def evaluate(data_source, batch_size, params_file_name, ctx=None):
- """Evaluate the model on the dataset.
-
- Parameters
- ----------
- data_source : NDArray
- The dataset is evaluated on.
- batch_size : int
- The size of the mini-batch.
- params_file_name : str
- The parameter file to use to evaluate,
- e.g., val.params or args.save
- ctx : mx.cpu() or mx.gpu()
- The context of the computation.
-
- Returns
- -------
- loss: float
- The loss on the dataset
- """
-
- total_L = 0.0
- ntotal = 0
-
- model_eval.load_parameters(params_file_name, context)
-
- hidden = model_eval.begin_state(batch_size=batch_size, func=mx.nd.zeros, ctx=context[0])
- i = 0
- while i < len(data_source) - 1 - 1:
- data, target = get_batch(data_source, i, seq_len=args.bptt)
- data = data.as_in_context(ctx)
- target = target.as_in_context(ctx)
- output, hidden = model_eval(data, hidden)
- hidden = detach(hidden)
- L = loss(output.reshape(-3, -1),
- target.reshape(-1,))
- total_L += mx.nd.sum(L).asscalar()
- ntotal += L.size
- i += args.bptt
- return total_L / ntotal
-
-
-def train():
- """Training loop for awd language model.
-
- """
- ntasgd = False
- best_val = float('Inf')
- start_train_time = time.time()
- parameters = model.collect_params()
- param_dict_avg = None
- t = 0
- avg_trigger = 0
- n = 5
- valid_losses = []
- for epoch in range(args.epochs):
- total_L = 0.0
- start_epoch_time = time.time()
- start_log_interval_time = time.time()
- hiddens = [model.begin_state(args.batch_size//len(context),
- func=mx.nd.zeros, ctx=ctx) for ctx in context]
- batch_i, i = 0, 0
- while i < len(train_data) - 1 - 1:
- bptt = args.bptt if mx.nd.random.uniform().asscalar() < 0.95 else args.bptt / 2
- seq_len = max(5, int(mx.nd.random.normal(bptt, 5).asscalar()))
- lr_batch_start = trainer.learning_rate
- trainer.set_learning_rate(lr_batch_start*seq_len/args.bptt)
-
- data, target = get_batch(train_data, i, seq_len=seq_len)
- data_list = gluon.utils.split_and_load(data, context, batch_axis=1, even_split=True)
- target_list = gluon.utils.split_and_load(target, context, batch_axis=1, even_split=True)
- hiddens = detach(hiddens)
- Ls = []
- with autograd.record():
- for j, (X, y, h) in enumerate(zip(data_list, target_list, hiddens)):
- output, h, encoder_hs, dropped_encoder_hs = model(X, h)
- l = joint_loss(output, y, encoder_hs, dropped_encoder_hs)
- Ls.append(l / (len(context) * X.size))
- hiddens[j] = h
- for L in Ls:
- L.backward()
-
- grads = [p.grad(d.context) for p in parameters.values() for d in data_list]
- gluon.utils.clip_global_norm(grads, args.clip)
-
- if args.ntasgd and ntasgd:
- if param_dict_avg is None:
- param_dict_avg = {k.split(model._prefix)[1]: v.data(context[0]).copy()
- for k, v in parameters.items()}
-
- trainer.step(1)
-
- if args.ntasgd and ntasgd:
- gamma = 1.0 / max(1, epoch * (len(train_data) // args.bptt)
- + batch_i - avg_trigger + 2)
- for name, param_avg in param_dict_avg.items():
- param_avg[:] += gamma * (parameters['{}{}'.format(model._prefix, name)]
- .data(context[0]) - param_avg)
-
- total_L += sum([mx.nd.sum(L).asscalar() for L in Ls])
- trainer.set_learning_rate(lr_batch_start)
-
- if batch_i % args.log_interval == 0 and batch_i > 0:
- cur_L = total_L / args.log_interval
- print('[Epoch %d Batch %d/%d] current loss %.2f, ppl %.2f, '
- 'throughput %.2f samples/s, lr %.2f'
- % (epoch, batch_i, len(train_data) // args.bptt, cur_L, math.exp(cur_L),
- args.batch_size * args.log_interval
- / (time.time() - start_log_interval_time),
- lr_batch_start * seq_len / args.bptt))
- total_L = 0.0
- start_log_interval_time = time.time()
- i += seq_len
- batch_i += 1
-
- mx.nd.waitall()
-
- print('[Epoch %d] throughput %.2f samples/s' % (
- epoch, (args.batch_size * len(train_data)) / (time.time() - start_epoch_time)))
-
- if args.ntasgd and ntasgd:
- mx.nd.save('{}.val.params'.format(args.save), param_dict_avg)
- else:
- model.save_parameters('{}.val.params'.format(args.save))
- val_L = evaluate(val_data, val_batch_size, '{}.val.params'.format(args.save), context[0])
- print('[Epoch %d] time cost %.2fs, valid loss %.2f, valid ppl %.2f, lr %.2f' % (
- epoch, time.time() - start_epoch_time, val_L, math.exp(val_L),
- trainer.learning_rate))
-
- if args.ntasgd and avg_trigger == 0:
- if t > n and val_L > min(valid_losses[-n:]):
- if param_dict_avg is None:
- param_dict_avg = {k.split(model._prefix)[1]: v.data(context[0]).copy()
- for k, v in parameters.items()}
- else:
- for k, v in parameters.items():
- param_dict_avg[k.split(model._prefix)[1]] \
- = v.data(context[0]).copy()
- avg_trigger = epoch * (len(train_data) // args.bptt) + len(train_data) // args.bptt
- print('Switching to NTASGD and avg_trigger is : %d' % avg_trigger)
- ntasgd = True
- valid_losses.append(val_L)
- t += 1
-
- if val_L < best_val:
- update_lr_epoch = 0
- best_val = val_L
- if args.ntasgd and ntasgd:
- mx.nd.save(args.save, param_dict_avg)
- else:
- model.save_parameters(args.save)
- test_L = evaluate(test_data, test_batch_size, args.save, context[0])
- print('[Epoch %d] test loss %.2f, test ppl %.2f'
- % (epoch, test_L, math.exp(test_L)))
- else:
- update_lr_epoch += 1
- if update_lr_epoch % args.lr_update_interval == 0 and update_lr_epoch != 0:
- lr_scale = trainer.learning_rate * args.lr_update_factor
- print('Learning rate after interval update %f' % lr_scale)
- trainer.set_learning_rate(lr_scale)
- update_lr_epoch = 0
-
- print('Total training throughput %.2f samples/s'
- % ((args.batch_size * len(train_data) * args.epochs) / (time.time() - start_train_time)))
-
-
-if __name__ == '__main__':
- start_pipeline_time = time.time()
- if not args.eval_only:
- train()
- model.load_parameters(args.save, context)
- final_val_L = evaluate(val_data, val_batch_size, args.save, context[0])
- final_test_L = evaluate(test_data, test_batch_size, args.save, context[0])
- print('Best validation loss %.2f, val ppl %.2f' % (final_val_L, math.exp(final_val_L)))
- print('Best test loss %.2f, test ppl %.2f' % (final_test_L, math.exp(final_test_L)))
- print('Total time cost %.2fs' % (time.time()-start_pipeline_time))
diff --git a/scripts/language_model/xlnet_qa_evaluate.py b/scripts/language_model/xlnet_qa_evaluate.py
deleted file mode 100644
index 3421192d1a..0000000000
--- a/scripts/language_model/xlnet_qa_evaluate.py
+++ /dev/null
@@ -1,152 +0,0 @@
-# Copyright 2018 The Google AI Language Team Authors, Allenai and DMLC.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""XLNet SQuAD evaluate."""
-
-from collections import namedtuple, OrderedDict
-
-from mxnet import nd
-
-_PrelimPrediction = namedtuple( # pylint: disable=invalid-name
- 'PrelimPrediction', [
- 'feature_id', 'start_index', 'end_index', 'start_log_prob',
- 'end_log_prob'
- ])
-
-_NbestPrediction = namedtuple( # pylint: disable=invalid-name
- 'NbestPrediction', ['text', 'start_log_prob', 'end_log_prob'])
-
-
-def predict_extended(features,
- results,
- n_best_size,
- max_answer_length=64,
- start_n_top=5,
- end_n_top=5):
- """Get prediction results for XLNet.
-
- Parameters
- ----------
- features : list of SQuADFeature
- List of squad features for the example.
- results : list of data.qa.PredResult
- List of model predictions for span start and span end.
- tokenizer: callable
- Tokenizer function.
- max_answer_length: int, default 64
- Maximum length of the answer tokens.
- null_score_diff_threshold: float, default 0.0
- If null_score - best_non_null is greater than the threshold predict null.
- n_best_size: int, default 10
- The total number of n-best predictions.
- version_2: bool, default False
- If true, the SQuAD examples contain some that do not have an answer.
-
- Returns
- -------
- prediction: str
- The final prediction.
- nbest : list of (str, float)
- n-best predictions with their probabilities.
- """
-
- prelim_predictions = []
- score_null = 1000000 # large and positive
- for features_id, (result, feature) in enumerate(zip(results, features)):
- cur_null_score = result.cls_logits[0]
- score_null = min(score_null, cur_null_score)
- for i in range(start_n_top):
- for j in range(end_n_top):
- start_log_prob = result.start_top_log_probs[i]
- start_index = int(result.start_top_index[i])
- j_index = j * end_n_top + i
- end_log_prob = result.end_top_log_probs[j_index]
- end_index = int(result.end_top_index[j_index])
- # We could hypothetically create invalid predictions, e.g., predict
- # that the start of the span is in the question. We throw out all
- # invalid predictions.
- if start_index >= feature.paragraph_len - 1:
- continue
- if end_index >= feature.paragraph_len - 1:
- continue
-
- if not feature.token_is_max_context.get(start_index, False):
- continue
- if end_index < start_index:
- continue
- length = end_index - start_index + 1
- if length > max_answer_length:
- continue
- prelim_predictions.append(
- _PrelimPrediction(feature_id=features_id,
- start_index=start_index,
- end_index=end_index,
- start_log_prob=start_log_prob,
- end_log_prob=end_log_prob))
-
- prelim_predictions = sorted(prelim_predictions,
- key=lambda x:
- (x.start_log_prob + x.end_log_prob),
- reverse=True)
-
- seen_predictions = {}
- nbest = []
- for pred in prelim_predictions:
- if len(nbest) >= n_best_size:
- break
- feature = features[pred.feature_id]
- tok_start_to_orig_index = feature.tok_start_to_orig_index
- tok_end_to_orig_index = feature.tok_end_to_orig_index
- start_orig_pos = tok_start_to_orig_index[pred.start_index]
- end_orig_pos = tok_end_to_orig_index[pred.end_index]
-
- paragraph_text = feature.paragraph_text
- final_text = paragraph_text[start_orig_pos:end_orig_pos + 1].strip()
- if final_text in seen_predictions:
- continue
- seen_predictions[final_text] = True
- nbest.append(
- _NbestPrediction(text=final_text,
- start_log_prob=pred.start_log_prob,
- end_log_prob=pred.end_log_prob))
-
- # In very rare edge cases we could have no valid predictions. So we
- # just create a nonce prediction in this case to avoid failure.
- if not nbest:
- nbest.append(
- _NbestPrediction(text='', start_log_prob=-1e6, end_log_prob=-1e6))
-
- assert len(nbest) >= 1
-
- total_scores = []
- best_non_null_entry = None
- for entry in nbest:
- total_scores.append(entry.start_log_prob + entry.end_log_prob)
- if not best_non_null_entry:
- best_non_null_entry = entry
- probs = nd.softmax(nd.array(total_scores)).asnumpy()
-
- nbest_json = []
-
- for (i, entry) in enumerate(nbest):
- output = OrderedDict()
- output['text'] = entry.text
- output['probability'] = float(probs[i])
- output['start_log_prob'] = float(entry.start_log_prob)
- output['end_log_prob'] = float(entry.end_log_prob)
- nbest_json.append(output)
-
- assert len(nbest_json) >= 1
- assert best_non_null_entry is not None
- score_diff = score_null
- return score_diff, best_non_null_entry.text, nbest_json
diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md
new file mode 100644
index 0000000000..402e6272eb
--- /dev/null
+++ b/scripts/machine_translation/README.md
@@ -0,0 +1,172 @@
+# Machine Translation
+
+## Train a Transformer from scratch
+First, use the script described in [datasets/machine_translation](../datasets/machine_translation)
+to generate the dataset. Then, run `train_transformer.py` to train the model.
+
+In the following, we give the training commands for the WMT2014 EN-DE task with the yttm tokenizer.
+You may first run the following command in [datasets/machine_translation](../datasets/machine_translation).
+```bash
+bash wmt2014_ende.sh yttm
+```
+
+Then, you can run the experiments below.
+For the "transformer_base" configuration:
+
+```bash
+SUBWORD_ALGO=yttm
+SUBWORD_MODEL=${SUBWORD_ALGO}
+SRC=en
+TGT=de
+datapath=../datasets/machine_translation
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 2700 \
+ --max_update 15000 \
+ --save_interval_update 500 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Or train with Horovod:
+```bash
+horovodrun -np 4 -H localhost:4 python3 train_transformer.py \
+ --comm_backend horovod \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 2700 \
+ --max_update 15000 \
+ --save_interval_update 500 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Use the `gluon_average_checkpoint` CLI to average the last 10 checkpoints; a minimal sketch of what the averaging step does follows the command.
+
+```bash
+gluon_average_checkpoint --checkpoints transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch*.params \
+ --begin 30 \
+ --end 39 \
+ --save-path transformer_base_wmt2014_en_de_${SUBWORD_ALGO}/epoch_avg_30_39.params
+```
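+
+The `gluon_average_checkpoint` CLI essentially takes the element-wise mean of the selected
+parameter files. The following is only an illustrative sketch of that idea (it assumes plain
+MXNet `.params` files and is not the actual CLI implementation):
+
+```python
+import mxnet as mx
+
+def average_checkpoints(paths, save_path):
+    """Element-wise average of the parameters stored in several .params files (sketch)."""
+    avg = None
+    for path in paths:
+        params = mx.nd.load(path)  # dict: parameter name -> NDArray
+        if avg is None:
+            # copy (and upcast) the first checkpoint as the running sum
+            avg = {k: v.astype('float32') for k, v in params.items()}
+        else:
+            for k, v in params.items():
+                avg[k] += v.astype('float32')
+    # divide the running sum by the number of averaged checkpoints
+    avg = {k: v / len(paths) for k, v in avg.items()}
+    mx.nd.save(save_path, avg)
+
+# e.g. average_checkpoints(paths=[...last 10 checkpoint paths...],
+#                          save_path='epoch_avg_30_39.params')
+```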
+
+Use the following command to run inference/evaluation with the Transformer model:
+
+```bash
+python3 evaluate_transformer.py \
+ --param_path transformer_base_wmt2014_en_de_${SUBWORD_MODEL}/epoch_avg_30_39.params \
+ --src_lang en \
+ --tgt_lang de \
+ --cfg transformer_base_wmt2014_en_de_${SUBWORD_MODEL}/config.yml \
+ --src_tokenizer ${SUBWORD_MODEL} \
+ --tgt_tokenizer ${SUBWORD_MODEL} \
+ --src_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --tgt_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --src_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --tgt_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --src_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.en \
+ --tgt_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.de
+```
+
+
+
+For the "transformer_wmt_en_de_big" configuration:
+
+```bash
+SUBWORD_ALGO=yttm
+SUBWORD_MODEL=${SUBWORD_ALGO}
+SRC=en
+TGT=de
+datapath=../datasets/machine_translation
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_big_wmt2014_en_de_${SUBWORD_ALGO} \
+ --cfg transformer_wmt_en_de_big \
+ --lr 0.001 \
+ --sampler BoundedBudgetSampler \
+ --max_num_tokens 3584 \
+ --max_update 15000 \
+ --warmup_steps 4000 \
+ --warmup_init_lr 0.0 \
+ --seed 123 \
+ --gpus 0,1,2,3
+```
+
+Use the `gluon_average_checkpoint` CLI to average the last 10 checkpoints:
+
+```bash
+gluon_average_checkpoint --checkpoints transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/update*.params \
+ --begin 21 \
+ --end 30 \
+ --save-path transformer_big_wmt2014_en_de_${SUBWORD_ALGO}/avg_21_30.params
+```
+
+
+Use the following command to run inference/evaluation with the Transformer model:
+
+```bash
+python3 evaluate_transformer.py \
+    --param_path transformer_big_wmt2014_en_de_${SUBWORD_MODEL}/avg_21_30.params \
+ --src_lang en \
+ --tgt_lang de \
+ --cfg transformer_big_wmt2014_en_de_${SUBWORD_MODEL}/config.yml \
+ --src_tokenizer ${SUBWORD_MODEL} \
+ --tgt_tokenizer ${SUBWORD_MODEL} \
+ --src_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --tgt_subword_model_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.model \
+ --src_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --tgt_vocab_path ../datasets/machine_translation/wmt2014_ende/${SUBWORD_MODEL}.vocab \
+ --src_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.en \
+ --tgt_corpus ../datasets/machine_translation/wmt2014_ende/test.raw.de
+```
+
+
+Test BLEU scores with 3 seeds (evaluated via sacreBLEU):
+
+- transformer_base
+
+(test BLEU / valid BLEU)
+
+| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
+|---------------|------------|-------------|-------------|--------------|-------------|
+| yttm | | 26.50/26.29 | - | - | - |
+| hf_bpe | | - | - | - | - |
+| spm | | - | - | - | - |
+
+- transformer_wmt_en_de_big
+
+(test BLEU / valid BLEU)
+
+| Subword Model | #Params | Seed = 123 | Seed = 1234 | Seed = 12345 | Mean±std |
+|---------------|------------|-------------|-------------|--------------|-------------|
+| yttm | | 27.93/26.82 | - | - | - |
+| hf_bpe | | - | - | - | - |
+| spm | | - | - | - | - |
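+
+For reference, the sacreBLEU score can also be recomputed offline from the files that
+`evaluate_transformer.py` writes to `--save_dir` (`pred_sentences.txt` and `gt_sentences.txt`),
+using the same `sacrebleu` API call as the script. A minimal sketch (the `save_dir` value below
+is a placeholder for your own evaluation directory):
+
+```python
+import sacrebleu
+
+save_dir = 'transformer_base_wmt2014_en_de_yttm/epoch_avg_30_39_evaluation'  # placeholder path
+
+def read_lines(path):
+    with open(path, encoding='utf-8') as f:
+        return [line.rstrip('\n') for line in f]
+
+pred_lines = read_lines(save_dir + '/pred_sentences.txt')  # system outputs, one per line
+ref_lines = read_lines(save_dir + '/gt_sentences.txt')     # references, one per line
+bleu = sacrebleu.corpus_bleu(sys_stream=pred_lines, ref_streams=[ref_lines])
+print(bleu.score)
+```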
diff --git a/scripts/machine_translation/__init__.py b/scripts/machine_translation/__init__.py
index 4c7a3827b3..e69de29bb2 100644
--- a/scripts/machine_translation/__init__.py
+++ b/scripts/machine_translation/__init__.py
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""NMT example."""
-from . import _constants, bleu, dataset, \
- gnmt, translation, utils
diff --git a/scripts/machine_translation/_constants.py b/scripts/machine_translation/_constants.py
deleted file mode 100644
index a3d996d240..0000000000
--- a/scripts/machine_translation/_constants.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Constants used in the NMT examples."""
-import os
-
-__all__ = ['CACHE_PATH']
-
-CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
diff --git a/scripts/machine_translation/bleu.py b/scripts/machine_translation/bleu.py
deleted file mode 100644
index 2a0c820ccd..0000000000
--- a/scripts/machine_translation/bleu.py
+++ /dev/null
@@ -1,352 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""BLEU."""
-import sys
-import re
-import math
-import unicodedata
-from collections import Counter
-import six
-LIST_TYPES = (list, tuple)
-
-__all__ = ['compute_bleu']
-
-
-def _ngrams(segment, n):
- """Extracts n-grams from an input segment.
-
- Parameters
- ----------
- segment: list
- Text segment from which n-grams will be extracted.
- n: int
- Order of n-gram.
-
- Returns
- -------
- ngram_counts: Counter
- Contain all the nth n-grams in segment with a count of how many times each n-gram occurred.
- """
- ngram_counts = Counter()
- for i in range(0, len(segment) - n + 1):
- ngram = tuple(segment[i:i + n])
- ngram_counts[ngram] += 1
- return ngram_counts
-
-
-def _split_compound_word(segment):
- """Put compounds in ATAT format.
- rich-text format" --> rich ##AT##-##AT## text format.
- """
- return re.sub(r'(\S)-(\S)', '\\1 ##AT##-##AT## \\2', ' '.join(segment)).split()
-
-
-def _bpe_to_words(sentence, delimiter='@@'):
- """Convert a sequence of bpe words into sentence."""
- words = []
- word = ''
- delimiter_len = len(delimiter)
- for subwords in sentence:
- if len(subwords) >= delimiter_len and subwords[-delimiter_len:] == delimiter:
- word += subwords[:-delimiter_len]
- else:
- word += subwords
- words.append(word)
- word = ''
- return words
-
-
-def _tokenize_mteval_13a(segment):
- r"""
- Tokenizes a string following the tokenizer in mteval-v13a.pl.
- See https://github.com/moses-smt/mosesdecoder/"
- "blob/master/scripts/generic/mteval-v14.pl#L917-L942
- Parameters
- ----------
- segment: str
- A string to be tokenized
-
- Returns
- -------
- The tokenized string
- """
-
- norm = segment.rstrip()
-
- norm = norm.replace('', '')
- norm = norm.replace('-\n', '')
- norm = norm.replace('\n', ' ')
- norm = norm.replace('"', '"')
- norm = norm.replace('&', '&')
- norm = norm.replace('<', '<')
- norm = norm.replace('>', '>')
-
- norm = u' {} '.format(norm)
- norm = re.sub(r'([\{-\~\[-\` -\&\(-\+\:-\@\/])', ' \\1 ', norm)
- norm = re.sub(r'([^0-9])([\.,])', '\\1 \\2 ', norm)
- norm = re.sub(r'([\.,])([^0-9])', ' \\1 \\2', norm)
- norm = re.sub(r'([0-9])(-)', '\\1 \\2 ', norm)
- norm = re.sub(r'\s+', ' ', norm)
- norm = re.sub(r'^\s+', '', norm)
- norm = re.sub(r'\s+$', '', norm)
-
- return norm
-
-
-class UnicodeRegex:
- """Ad-hoc hack to recognize all punctuation and symbols.
- """
- def __init__(self):
- punctuation = self._property_chars('P')
- self.nondigit_punct_re = re.compile(r'([^\d])([' + punctuation + r'])')
- self.punct_nondigit_re = re.compile(r'([' + punctuation + r'])([^\d])')
- self.symbol_re = re.compile('([' + self._property_chars('S') + '])')
-
- def _property_chars(self, prefix):
- return ''.join(six.unichr(x) for x in range(sys.maxunicode)
- if unicodedata.category(six.unichr(x)).startswith(prefix))
-
-
-unicodeRegex = UnicodeRegex()
-
-
-def _tokenize_mteval_v14_intl(segment):
-    r"""Tokenize a string following the international tokenizer in mteval-v14a.pl.
- See https://github.com/moses-smt/mosesdecoder/"
- "blob/master/scripts/generic/mteval-v14.pl#L954-L983
-
- Parameters
- ----------
- segment: str
- A string to be tokenized
-
- Returns
- -------
- The tokenized string
- """
- segment = segment.rstrip()
- segment = unicodeRegex.nondigit_punct_re.sub(r'\1 \2 ', segment)
- segment = unicodeRegex.punct_nondigit_re.sub(r' \1 \2', segment)
- segment = unicodeRegex.symbol_re.sub(r' \1 ', segment)
- return segment.strip()
-
-
-TOKENIZERS = {
- '13a': _tokenize_mteval_13a,
- 'intl': _tokenize_mteval_v14_intl,
- None: lambda x: x,
-}
-
-
-def compute_bleu(reference_corpus_list, translation_corpus, tokenized=True,
- tokenizer='13a', max_n=4, smooth=False, lower_case=False,
- bpe=False, split_compound_word=False):
- r"""Compute bleu score of translation against references.
-
- Parameters
- ----------
- reference_corpus_list: list of list(list(str)) or list of list(str)
- list of list(list(str)): tokenized references
- list of list(str): plain text
- List of references for each translation.
- translation_corpus: list(list(str)) or list(str)
- list(list(str)): tokenized translation
- list(str): plain text
- Translations to score.
- tokenized: bool, default True
- Whether the inputs has been tokenized.
- tokenizer: str or None, default '13a'
- '13a': follow the tokenizer in mteval-v13a.pl
- 'intl': follow the international tokenizer in mteval-v14.pl
- None: identity mapping on the string.
- This option is ignored if tokenized is True
- max_n: int, default 4
- Maximum n-gram order to use when computing BLEU score.
- smooth: bool, default False
- Whether or not to compute smoothed bleu score.
- lower_case: bool, default False
- Whether or not to use lower case of tokens
- split_compound_word: bool, default False
- Whether or not to split compound words
- "rich-text format" --> rich ##AT##-##AT## text format.
- bpe: bool, default False
- Whether or not the inputs are in BPE format
-
- Returns
- -------
- 5-Tuple with the BLEU score, n-gram precisions, brevity penalty,
- reference length, and translation length
- """
- precision_numerators = [0 for _ in range(max_n)]
- precision_denominators = [0 for _ in range(max_n)]
- ref_length, trans_length = 0, 0
- for references in reference_corpus_list:
- assert len(references) == len(translation_corpus), \
- 'The number of translations and their references do not match'
- if tokenized:
- assert isinstance(reference_corpus_list[0][0], LIST_TYPES) and \
- isinstance(translation_corpus[0], LIST_TYPES), \
- 'references and translation should have format of list of list(list(str)) ' \
- 'and list(list(str)), respectively, when tokenized is True.'
- else:
- assert isinstance(reference_corpus_list[0][0], six.string_types) and \
- isinstance(translation_corpus[0], six.string_types), \
- 'references and translation should have format of list(list(str)) ' \
- 'and list(str), respectively, when tokenized is False.'
- for references, translation in zip(zip(*reference_corpus_list), translation_corpus):
- if not tokenized:
- references = [TOKENIZERS[tokenizer](reference).split() for reference in references]
- translation = TOKENIZERS[tokenizer](translation).split()
- if bpe:
- references = [_bpe_to_words(reference) for reference in references]
- translation = _bpe_to_words(translation)
- if split_compound_word:
- references = [_split_compound_word(reference) for reference in references]
- translation = _split_compound_word(translation)
- if lower_case:
- references = [[w.lower() for w in reference] for reference in references]
- translation = [w.lower() for w in translation]
- trans_len = len(translation)
- trans_length += trans_len
- ref_length += _closest_ref_length(references, trans_len)
- for n in range(max_n):
- matches, candidates = _compute_precision(references, translation, n + 1)
- precision_numerators[n] += matches
- precision_denominators[n] += candidates
-
- precision_fractions = [(precision_numerators[n], precision_denominators[n])
- for n in range(max_n)]
- smooth_const = 0
- if smooth:
- smooth_const = 1
- precisions = _smoothing(precision_fractions, smooth_const)
- if min(precisions) > 0:
- precision_log_average = sum(math.log(p) for p in precisions) / max_n
- precision_exp_log_average = math.exp(precision_log_average)
- else:
- precision_exp_log_average = 0
-
- bp = _brevity_penalty(ref_length, trans_length)
- bleu = precision_exp_log_average*bp
-
- return bleu, precisions, bp, ref_length, trans_length
-
-
-def _compute_precision(references, translation, n):
- """Compute ngram precision.
-
- Parameters
- ----------
- references: list(list(str))
- A list of references.
- translation: list(str)
- A translation.
- n: int
- Order of n-gram.
-
- Returns
- -------
- matches: int
- Number of matched nth order n-grams
- candidates
- Number of possible nth order n-grams
- """
- matches = 0
- candidates = 0
- ref_ngram_counts = Counter()
-
- for reference in references:
- ref_ngram_counts |= _ngrams(reference, n)
- trans_ngram_counts = _ngrams(translation, n)
- overlap_ngram_counts = trans_ngram_counts & ref_ngram_counts
- matches += sum(overlap_ngram_counts.values())
- possible_matches = len(translation) - n + 1
- if possible_matches > 0:
- candidates += possible_matches
-
- return matches, candidates
-
-
-def _brevity_penalty(ref_length, trans_length):
- """Calculate brevity penalty.
-
- Parameters
- ----------
- ref_length: int
-        Sum of the closest reference lengths for all translations in the corpus.
-    trans_length: int
-        Sum of the lengths of all translations in the corpus.
-
- Returns
- -------
- bleu's brevity penalty: float
- """
- if trans_length > ref_length:
- return 1
- # If translation is empty, brevity penalty = 0 should result in BLEU = 0.0
- elif trans_length == 0:
- return 0
- else:
- return math.exp(1 - float(ref_length) / trans_length)
-
-
-def _closest_ref_length(references, trans_length):
- """Find the reference that has the closest length to the translation.
-
- Parameters
- ----------
- references: list(list(str))
- A list of references.
- trans_length: int
- Length of the translation.
-
- Returns
- -------
- closest_ref_len: int
- Length of the reference that is closest to the translation.
- """
- ref_lengths = (len(reference) for reference in references)
- closest_ref_len = min(ref_lengths,
- key=lambda ref_length: (abs(ref_length - trans_length), ref_length))
-
- return closest_ref_len
-
-
-def _smoothing(precision_fractions, c=1):
- """Compute the smoothed precision for all the orders.
-
- Parameters
- ----------
- precision_fractions: list(tuple)
- Contain a list of (precision_numerator, precision_denominator) pairs
- c: int, default 1
- Smoothing constant to use
-
- Returns
- -------
- ratios: list of floats
- Contain the smoothed precision_fractions.
- """
- ratios = [0] * len(precision_fractions)
- for i, precision_fraction in enumerate(precision_fractions):
- if precision_fraction[1] > 0:
- ratios[i] = float(precision_fraction[0] + c) / (precision_fraction[1] + c)
- else:
- ratios[i] = 0.0
-
- return ratios
diff --git a/scripts/machine_translation/dataprocessor.py b/scripts/machine_translation/dataprocessor.py
deleted file mode 100644
index e60989e2e1..0000000000
--- a/scripts/machine_translation/dataprocessor.py
+++ /dev/null
@@ -1,284 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data preprocessing for transformer."""
-
-import os
-import io
-import time
-import logging
-import numpy as np
-from mxnet import gluon
-import gluonnlp as nlp
-import gluonnlp.data.batchify as btf
-import _constants
-import dataset as _dataset
-
-
-def _cache_dataset(dataset, prefix):
- """Cache the processed npy dataset the dataset into a npz
-
- Parameters
- ----------
- dataset : SimpleDataset
- file_path : str
- """
- if not os.path.exists(_constants.CACHE_PATH):
- os.makedirs(_constants.CACHE_PATH)
- src_data = np.concatenate([e[0] for e in dataset])
- tgt_data = np.concatenate([e[1] for e in dataset])
- src_cumlen = np.cumsum([0]+[len(e[0]) for e in dataset])
- tgt_cumlen = np.cumsum([0]+[len(e[1]) for e in dataset])
- np.savez(os.path.join(_constants.CACHE_PATH, prefix + '.npz'),
- src_data=src_data, tgt_data=tgt_data,
- src_cumlen=src_cumlen, tgt_cumlen=tgt_cumlen)
-
-
-def _load_cached_dataset(prefix):
- cached_file_path = os.path.join(_constants.CACHE_PATH, prefix + '.npz')
- if os.path.exists(cached_file_path):
- print('Loading dataset...')
- npz_data = np.load(cached_file_path)
- src_data, tgt_data, src_cumlen, tgt_cumlen = \
- [npz_data[n] for n in ['src_data', 'tgt_data', 'src_cumlen', 'tgt_cumlen']]
- src_data = np.array([src_data[low:high] for low, high
- in zip(src_cumlen[:-1], src_cumlen[1:])])
- tgt_data = np.array([tgt_data[low:high] for low, high
- in zip(tgt_cumlen[:-1], tgt_cumlen[1:])])
- return gluon.data.ArrayDataset(np.array(src_data), np.array(tgt_data))
- else:
- return None
-
-
-class TrainValDataTransform:
- """Transform the machine translation dataset.
-
- Clip source and the target sentences to the maximum length. For the source sentence, append the
- EOS. For the target sentence, append BOS and EOS.
-
- Parameters
- ----------
- src_vocab : Vocab
- tgt_vocab : Vocab
- src_max_len : int
- tgt_max_len : int
- """
-
- def __init__(self, src_vocab, tgt_vocab, src_max_len=None, tgt_max_len=None):
- self._src_vocab = src_vocab
- self._tgt_vocab = tgt_vocab
- self._src_max_len = src_max_len
- self._tgt_max_len = tgt_max_len
-
- def __call__(self, src, tgt):
- # For src_max_len < 0, we do not clip the sequence
- if self._src_max_len >= 0:
- src_sentence = self._src_vocab[src.split()[:self._src_max_len]]
- else:
- src_sentence = self._src_vocab[src.split()]
- # For tgt_max_len < 0, we do not clip the sequence
- if self._tgt_max_len >= 0:
- tgt_sentence = self._tgt_vocab[tgt.split()[:self._tgt_max_len]]
- else:
- tgt_sentence = self._tgt_vocab[tgt.split()]
- src_sentence.append(self._src_vocab[self._src_vocab.eos_token])
- tgt_sentence.insert(0, self._tgt_vocab[self._tgt_vocab.bos_token])
- tgt_sentence.append(self._tgt_vocab[self._tgt_vocab.eos_token])
- src_npy = np.array(src_sentence, dtype=np.int32)
- tgt_npy = np.array(tgt_sentence, dtype=np.int32)
- return src_npy, tgt_npy
-
-
-def process_dataset(dataset, src_vocab, tgt_vocab, src_max_len=-1, tgt_max_len=-1):
- start = time.time()
- dataset_processed = dataset.transform(TrainValDataTransform(src_vocab, tgt_vocab,
- src_max_len,
- tgt_max_len), lazy=False)
- end = time.time()
- print('Processing Time spent: {}'.format(end - start))
- return dataset_processed
-
-
-def load_translation_data(dataset, bleu, args):
- """Load translation dataset
-
- Parameters
- ----------
- dataset : str
- args : argparse result
-
- Returns
- -------
-
- """
- src_lang, tgt_lang = args.src_lang, args.tgt_lang
- if dataset == 'IWSLT2015':
- common_prefix = 'IWSLT2015_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.IWSLT2015('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.IWSLT2015('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.IWSLT2015('test', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2016BPE':
- common_prefix = 'WMT2016BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.WMT2016BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2016BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2016BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2014BPE':
- common_prefix = 'WMT2014BPE_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = nlp.data.WMT2014BPE('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = nlp.data.WMT2014BPE('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = nlp.data.WMT2014BPE('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=args.full)
- elif dataset == 'TOY':
- common_prefix = 'TOY_{}_{}_{}_{}'.format(src_lang, tgt_lang,
- args.src_max_len, args.tgt_max_len)
- data_train = _dataset.TOY('train', src_lang=src_lang, tgt_lang=tgt_lang)
- data_val = _dataset.TOY('val', src_lang=src_lang, tgt_lang=tgt_lang)
- data_test = _dataset.TOY('test', src_lang=src_lang, tgt_lang=tgt_lang)
- else:
- raise NotImplementedError
- src_vocab, tgt_vocab = data_train.src_vocab, data_train.tgt_vocab
- data_train_processed = _load_cached_dataset(common_prefix + '_train')
- if not data_train_processed:
- data_train_processed = process_dataset(data_train, src_vocab, tgt_vocab,
- args.src_max_len, args.tgt_max_len)
- _cache_dataset(data_train_processed, common_prefix + '_train')
- data_val_processed = _load_cached_dataset(common_prefix + '_val')
- if not data_val_processed:
- data_val_processed = process_dataset(data_val, src_vocab, tgt_vocab)
- _cache_dataset(data_val_processed, common_prefix + '_val')
- if dataset == 'WMT2014BPE':
- filename = common_prefix + '_' + str(args.full) + '_test'
- else:
- filename = common_prefix + '_test'
- data_test_processed = _load_cached_dataset(filename)
- if not data_test_processed:
- data_test_processed = process_dataset(data_test, src_vocab, tgt_vocab)
- _cache_dataset(data_test_processed, filename)
- if bleu == 'tweaked':
- fetch_tgt_sentence = lambda src, tgt: tgt.split()
- val_tgt_sentences = list(data_val.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(data_test.transform(fetch_tgt_sentence))
- elif bleu in ('13a', 'intl'):
- fetch_tgt_sentence = lambda src, tgt: tgt
- if dataset == 'WMT2016BPE':
- val_text = nlp.data.WMT2016('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2016('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang)
- elif dataset == 'WMT2014BPE':
- val_text = nlp.data.WMT2014('newstest2013', src_lang=src_lang, tgt_lang=tgt_lang)
- test_text = nlp.data.WMT2014('newstest2014', src_lang=src_lang, tgt_lang=tgt_lang,
- full=args.full)
- elif dataset in ('IWSLT2015', 'TOY'):
- val_text = data_val
- test_text = data_test
- else:
- raise NotImplementedError
- val_tgt_sentences = list(val_text.transform(fetch_tgt_sentence))
- test_tgt_sentences = list(test_text.transform(fetch_tgt_sentence))
- else:
- raise NotImplementedError
- return data_train_processed, data_val_processed, data_test_processed, \
- val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab
-
-
-def get_data_lengths(dataset):
- get_lengths = lambda *args: (args[2], args[3])
- return list(dataset.transform(get_lengths))
-
-def get_dataloader(data_set, args, dataset_type,
- use_average_length=False, num_shards=0, num_workers=8):
- """Create data loaders for training/validation/test."""
- assert dataset_type in ['train', 'val', 'test']
-
- if args.bucket_scheme == 'constant':
- bucket_scheme = nlp.data.ConstWidthBucket()
- elif args.bucket_scheme == 'linear':
- bucket_scheme = nlp.data.LinearWidthBucket()
- elif args.bucket_scheme == 'exp':
- bucket_scheme = nlp.data.ExpWidthBucket(bucket_len_step=1.2)
- else:
- raise NotImplementedError
-
- data_lengths = get_data_lengths(data_set)
-
- if dataset_type == 'train':
- train_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
- btf.Stack(dtype='float32'), btf.Stack(dtype='float32'))
-
- else:
- data_lengths = list(map(lambda x: x[-1], data_lengths))
- test_batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0),
- btf.Stack(dtype='float32'), btf.Stack(dtype='float32'),
- btf.Stack())
-
- batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
- batch_size=(args.batch_size \
- if dataset_type == 'train' \
- else args.test_batch_size),
- num_buckets=args.num_buckets,
- ratio=args.bucket_ratio,
- shuffle=(dataset_type == 'train'),
- use_average_length=use_average_length,
- num_shards=num_shards,
- bucket_scheme=bucket_scheme)
-
- if dataset_type == 'train':
- logging.info('Train Batch Sampler:\n%s', batch_sampler.stats())
- data_loader = nlp.data.ShardedDataLoader(data_set,
- batch_sampler=batch_sampler,
- batchify_fn=train_batchify_fn,
- num_workers=num_workers)
- else:
- if dataset_type == 'val':
- logging.info('Valid Batch Sampler:\n%s', batch_sampler.stats())
- else:
- logging.info('Test Batch Sampler:\n%s', batch_sampler.stats())
-
- data_loader = gluon.data.DataLoader(data_set,
- batch_sampler=batch_sampler,
- batchify_fn=test_batchify_fn,
- num_workers=num_workers)
-
- return data_loader
-
-def make_dataloader(data_train, data_val, data_test, args,
- use_average_length=False, num_shards=0, num_workers=8):
- """Create data loaders for training/validation/test."""
- train_data_loader = get_dataloader(data_train, args, dataset_type='train',
- use_average_length=use_average_length,
- num_shards=num_shards,
- num_workers=num_workers)
-
- val_data_loader = get_dataloader(data_val, args, dataset_type='val',
- use_average_length=use_average_length,
- num_workers=num_workers)
-
- test_data_loader = get_dataloader(data_test, args, dataset_type='test',
- use_average_length=use_average_length,
- num_workers=num_workers)
-
- return train_data_loader, val_data_loader, test_data_loader
-
-
-def write_sentences(sentences, file_path):
- with io.open(file_path, 'w', encoding='utf-8') as of:
- for sent in sentences:
- if isinstance(sent, (list, tuple)):
- of.write(' '.join(sent) + '\n')
- else:
- of.write(sent + '\n')
diff --git a/scripts/machine_translation/dataset.py b/scripts/machine_translation/dataset.py
deleted file mode 100644
index 5392e80508..0000000000
--- a/scripts/machine_translation/dataset.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-"""Translation datasets."""
-
-
-__all__ = ['TOY']
-
-import os
-from gluonnlp.base import get_home_dir
-from gluonnlp.data.translation import _TranslationDataset, _get_pair_key
-
-
-class TOY(_TranslationDataset):
- """A Small Translation Dataset for Testing Scripts.
-
- Parameters
- ----------
- segment : str or list of str, default 'train'
- Dataset segment. Options are 'train', 'val', 'test' or their combinations.
- src_lang : str, default 'en'
- The source language. Option for source and target languages are 'en' <-> 'de'
- tgt_lang : str, default 'de'
- The target language. Option for source and target languages are 'en' <-> 'de'
- root : str, default '$MXNET_HOME/datasets/translation_test'
- Path to temp folder for storing data.
- MXNET_HOME defaults to '~/.mxnet'.
- """
- def __init__(self, segment='train', src_lang='en', tgt_lang='de',
- root=os.path.join(get_home_dir(), 'datasets', 'translation_test')):
- self._supported_segments = ['train', 'val', 'test']
- self._archive_file = {_get_pair_key('en', 'de'):
- ('translation_test.zip',
- '14f6c8e31ac6ec84ce469b4c196d60b4c86a179d')}
- self._data_file = {_get_pair_key('en', 'de'):
- {'train_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'train_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'val_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'val_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'test_en': ('train.en',
- 'aa7f22b91eb93390fd342a57a81f51f53ed29542'),
- 'test_de': ('train.de',
- 'f914217ce23ddd8cac07e761a75685c043d4f6d3'),
- 'vocab_en': ('vocab.en.json',
- 'c7c6af4603ea70f0a4af2460a622333fbd014050'),
- 'vocab_de' : ('vocab.de.json',
- '5b6f1be36a3e3cb9946b86e5d0fc73d164fda99f')}}
- super(TOY, self).__init__('translation_test', segment=segment, src_lang=src_lang,
- tgt_lang=tgt_lang, root=root)
diff --git a/scripts/machine_translation/evaluate_transformer.py b/scripts/machine_translation/evaluate_transformer.py
new file mode 100644
index 0000000000..2dddfdc06b
--- /dev/null
+++ b/scripts/machine_translation/evaluate_transformer.py
@@ -0,0 +1,291 @@
+import numpy as np
+import random
+import os
+import mxnet as mx
+from mxnet import gluon
+import argparse
+import logging
+import time
+from gluonnlp.utils.misc import logging_config
+from gluonnlp.models.transformer import TransformerModel,\
+ TransformerNMTInference
+from gluonnlp.data.batchify import Tuple, Pad, Stack
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data import tokenizers
+from gluonnlp.sequence_sampler import BeamSearchSampler, BeamSearchScorer
+import sacrebleu
+from tqdm import tqdm
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Transformer for Neural Machine Translation. Load a checkpoint and inference.')
+ parser.add_argument('--seed', type=int, default=100, help='The random seed.')
+ parser.add_argument('--src_lang', type=str, default='en', help='Source language')
+ parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
+ parser.add_argument('--src_corpus', type=str, required=True,
+ help='The source corpus for evaluation.')
+ parser.add_argument('--tgt_corpus', type=str, default=None,
+ help='The target corpus for evaluation.')
+ parser.add_argument('--src_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, type=str,
+ help='The source tokenizer. Only supports online encoding at present.')
+ parser.add_argument('--tgt_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, type=str,
+ help='The target tokenizer. Only supports online encoding at present.')
+ parser.add_argument('--src_subword_model_path', type=str,
+ help='Path to the source subword model.')
+ parser.add_argument('--src_vocab_path', type=str,
+ help='Path to the source subword vocab.')
+ parser.add_argument('--tgt_subword_model_path', type=str,
+ help='Path to the target subword model.')
+ parser.add_argument('--tgt_vocab_path', type=str,
+ help='Path to the target subword vocab.')
+ parser.add_argument('--src_max_len', type=int, default=None,
+ help='Maximum length of the source sentence.')
+ parser.add_argument('--tgt_max_len', type=int, default=None,
+ help='Maximum length of the target sentence.')
+ parser.add_argument('--cfg', type=str, help='Config file of the Transformer model.')
+ parser.add_argument('--beam-size', type=int, default=4, help='Number of beams')
+ parser.add_argument('--lp_alpha', type=float, default=0.6,
+ help='The alpha value in the length penalty')
+ parser.add_argument('--lp_k', type=int, default=5, help='The K value in the length penalty')
+ parser.add_argument('--max_length_a', type=int, default=1,
+ help='The a in the a * x + b formula of beam search')
+ parser.add_argument('--max_length_b', type=int, default=50,
+ help='The b in the a * x + b formula of beam search')
+ parser.add_argument('--param_path', type=str, help='The path to the model parameters.')
+ parser.add_argument('--gpus', type=str, default='0',
+                        help='List of GPUs to run on, e.g. 0 or 0,2,5. Empty means using the CPU '
+                             '(using a single GPU is suggested).')
+ parser.add_argument('--save_dir', type=str, default=None,
+ help='The path to save the log files and predictions.')
+ parser.add_argument('--stochastic', action='store_true',
+ help='Whether to use the stochastic beam search')
+ parser.add_argument('--inference', action='store_true',
+                        help='Whether to run inference on your own data. '
+                             'When running inference, tgt_corpus is not needed and will be set to None.')
+ parser.add_argument('--fp16', action='store_true',
+ help='Whether to use dtype float16')
+ args = parser.parse_args()
+ if args.save_dir is None:
+ args.save_dir = os.path.splitext(args.param_path)[0] + '_evaluation'
+    assert args.inference or args.tgt_corpus, '--tgt_corpus is required when --inference is not set'
+ if args.inference:
+ args.tgt_corpus = None
+ logging_config(args.save_dir, console=True)
+ logging.info(args)
+ return args
+
+
+def process_corpus(corpus_path, sentence_normalizer, bpe_tokenizer,
+ base_tokenizer=None, add_bos=True,
+ add_eos=True):
+ processed_token_ids = []
+ raw_lines = []
+ with open(corpus_path, 'r', encoding='utf-8') as f:
+ for line in f:
+ line = line.strip()
+ raw_lines.append(line)
+ line = sentence_normalizer(line)
+ if base_tokenizer is not None:
+ line = ' '.join(base_tokenizer.encode(line))
+ bpe_token_ids = bpe_tokenizer.encode(line, output_type=int)
+ if add_bos:
+ bpe_token_ids = [bpe_tokenizer.vocab.bos_id] + bpe_token_ids
+ if add_eos:
+ bpe_token_ids.append(bpe_tokenizer.vocab.eos_id)
+ processed_token_ids.append(bpe_token_ids)
+ return processed_token_ids, raw_lines
+
+
+def create_tokenizer(tokenizer_type, model_path, vocab_path):
+ if tokenizer_type == 'spm':
+ return tokenizers.create(tokenizer_type, model_path=model_path, vocab=vocab_path)
+ elif tokenizer_type == 'subword_nmt':
+ return tokenizers.create(tokenizer_type, codec_path=model_path, vocab_path=vocab_path)
+ elif tokenizer_type == 'yttm':
+ return tokenizers.create(tokenizer_type, model_path=model_path)
+ elif tokenizer_type == 'hf_bytebpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_wordpiece':
+ return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_bpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ else:
+ raise NotImplementedError
+
+
+def evaluate(args):
+ ctx_l = [mx.cpu()] if args.gpus is None or args.gpus == '' else [mx.gpu(int(x)) for x in
+ args.gpus.split(',')]
+ src_normalizer = MosesNormalizer(args.src_lang)
+ tgt_normalizer = MosesNormalizer(args.tgt_lang)
+ base_src_tokenizer = tokenizers.create('moses', args.src_lang)
+ base_tgt_tokenizer = tokenizers.create('moses', args.tgt_lang)
+
+ src_tokenizer = create_tokenizer(args.src_tokenizer,
+ args.src_subword_model_path,
+ args.src_vocab_path)
+ tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
+ args.tgt_subword_model_path,
+ args.tgt_vocab_path)
+ src_vocab = src_tokenizer.vocab
+ tgt_vocab = tgt_tokenizer.vocab
+ if args.cfg.endswith('.yml'):
+ cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
+ else:
+ cfg = TransformerModel.get_cfg(args.cfg)
+ cfg.defrost()
+ cfg.MODEL.src_vocab_size = len(src_vocab)
+ cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
+ if args.fp16:
+ cfg.MODEL.dtype = 'float16'
+ cfg.freeze()
+ model = TransformerModel.from_cfg(cfg)
+ model.hybridize()
+ model.load_parameters(args.param_path, ctx=ctx_l)
+ inference_model = TransformerNMTInference(model=model)
+ inference_model.hybridize()
+ # Construct the BeamSearchSampler
+ if args.stochastic:
+ scorer = BeamSearchScorer(alpha=0.0,
+ K=0.0,
+ temperature=1.0,
+ from_logits=False)
+ else:
+ scorer = BeamSearchScorer(alpha=args.lp_alpha,
+ K=args.lp_k,
+ from_logits=False)
+ beam_search_sampler = BeamSearchSampler(beam_size=args.beam_size,
+ decoder=inference_model,
+ vocab_size=len(tgt_vocab),
+ eos_id=tgt_vocab.eos_id,
+ scorer=scorer,
+ stochastic=args.stochastic,
+ max_length_a=args.max_length_a,
+ max_length_b=args.max_length_b)
+
+ logging.info(beam_search_sampler)
+ all_src_token_ids, all_src_lines = process_corpus(
+ args.src_corpus,
+ sentence_normalizer=src_normalizer,
+ base_tokenizer=base_src_tokenizer,
+ bpe_tokenizer=src_tokenizer,
+ add_bos=False,
+ add_eos=True
+ )
+ if args.tgt_corpus is not None:
+ all_tgt_token_ids, all_tgt_lines = process_corpus(
+ args.tgt_corpus,
+ sentence_normalizer=tgt_normalizer,
+ base_tokenizer=base_tgt_tokenizer,
+ bpe_tokenizer=tgt_tokenizer,
+ add_bos=True,
+ add_eos=True
+ )
+    else:  # inference only: populate placeholder target tokens so batching still works
+ all_tgt_token_ids = all_tgt_lines = [[] for i in range(len(all_src_token_ids))]
+ test_dataloader = gluon.data.DataLoader(
+ list(zip(all_src_token_ids,
+ [len(ele) for ele in all_src_token_ids],
+ all_tgt_token_ids,
+ [len(ele) for ele in all_tgt_token_ids])),
+ batch_size=32,
+ batchify_fn=Tuple(Pad(), Stack(), Pad(), Stack()),
+ shuffle=False)
+
+ ctx = ctx_l[0]
+ pred_sentences = []
+ start_eval_time = time.time()
+ # evaluate
+ if not args.inference:
+ avg_nll_loss = 0
+ ntokens = 0
+ for i, (src_token_ids, src_valid_length, tgt_token_ids, tgt_valid_length)\
+ in enumerate(test_dataloader):
+ src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
+ src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
+ tgt_token_ids = mx.np.array(tgt_token_ids, ctx=ctx, dtype=np.int32)
+ tgt_valid_length = mx.np.array(tgt_valid_length, ctx=ctx, dtype=np.int32)
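+            # Teacher forcing: feed the gold target prefix (all tokens but the last)
+            # and score the next-token log-probabilities; the masked sum accumulates
+            # the corpus NLL, later normalized by token count to report perplexity.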
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
+ nll = - mx.npx.pick(pred_logits, tgt_token_ids[:, 1:])
+ avg_nll_loss += mx.npx.sequence_mask(nll,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True, axis=1).sum().asnumpy()
+ ntokens += int((tgt_valid_length - 1).sum().asnumpy())
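+            # Beam-search decode from <bos>; the top hypothesis is stripped of its
+            # bos/eos tokens, BPE-decoded and detokenized back into plain text.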
+ init_input = mx.np.array([tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
+ states = inference_model.init_states(src_token_ids, src_valid_length)
+ samples, scores, valid_length = beam_search_sampler(init_input, states, src_valid_length)
+ for j in range(samples.shape[0]):
+ pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
+ bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
+ pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
+ pred_sentences.append(pred_sentence)
+ print(pred_sentence)
+ print('Processed {}/{}'.format(len(pred_sentences), len(all_tgt_lines)))
+ end_eval_time = time.time()
+ avg_nll_loss = avg_nll_loss / ntokens
+
+ with open(os.path.join(args.save_dir, 'gt_sentences.txt'), 'w', encoding='utf-8') as of:
+ of.write('\n'.join(all_tgt_lines))
+ of.write('\n')
+ with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
+ of.write('\n'.join(pred_sentences))
+ of.write('\n')
+
+ sacrebleu_out = sacrebleu.corpus_bleu(sys_stream=pred_sentences, ref_streams=[all_tgt_lines])
+    logging.info('Time Spent: {}, #Sent={}, SacreBLEU={} '
+ '({:2.1f} {:2.1f} {:2.1f} {:2.1f}) '
+ '(BP={:.3f}, ratio={:.3f}, syslen={}, reflen={}), '
+ 'Avg NLL={}, Perplexity={}'
+ .format(end_eval_time - start_eval_time, len(all_tgt_lines),
+ sacrebleu_out.score,
+ *sacrebleu_out.precisions,
+ sacrebleu_out.bp, sacrebleu_out.sys_len / sacrebleu_out.ref_len,
+ sacrebleu_out.sys_len, sacrebleu_out.ref_len,
+ avg_nll_loss, np.exp(avg_nll_loss)))
+ # inference only
+ else:
+ with open(os.path.join(args.save_dir, 'pred_sentences.txt'), 'w', encoding='utf-8') as of:
+ processed_sentences = 0
+ for src_token_ids, src_valid_length, _, _ in tqdm(test_dataloader):
+ src_token_ids = mx.np.array(src_token_ids, ctx=ctx, dtype=np.int32)
+ src_valid_length = mx.np.array(src_valid_length, ctx=ctx, dtype=np.int32)
+                init_input = mx.np.array(
+                    [tgt_vocab.bos_id for _ in range(src_token_ids.shape[0])], ctx=ctx)
+ states = inference_model.init_states(src_token_ids, src_valid_length)
+                samples, scores, valid_length = beam_search_sampler(
+                    init_input, states, src_valid_length)
+ for j in range(samples.shape[0]):
+ pred_tok_ids = samples[j, 0, :valid_length[j, 0].asnumpy()].asnumpy().tolist()
+ bpe_decode_line = tgt_tokenizer.decode(pred_tok_ids[1:-1])
+ pred_sentence = base_tgt_tokenizer.decode(bpe_decode_line.split(' '))
+ pred_sentences.append(pred_sentence)
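+                # Write this batch's translations immediately and clear the buffer so memory
+                # stays bounded on large corpora.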
+ of.write('\n'.join(pred_sentences))
+ of.write('\n')
+ processed_sentences += len(pred_sentences)
+ pred_sentences = []
+ end_eval_time = time.time()
+ logging.info('Time Spent: {}, Inferred sentences: {}'
+ .format(end_eval_time - start_eval_time, processed_sentences))
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+    os.environ['MXNET_USE_FUSION'] = '0'  # Manually disable pointwise fusion
+ args = parse_args()
+ np.random.seed(args.seed)
+ mx.random.seed(args.seed)
+ random.seed(args.seed)
+ evaluate(args)
diff --git a/scripts/machine_translation/gnmt.py b/scripts/machine_translation/gnmt.py
deleted file mode 100644
index c31cb1d66f..0000000000
--- a/scripts/machine_translation/gnmt.py
+++ /dev/null
@@ -1,512 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Encoder and decoder usded in sequence-to-sequence learning."""
-__all__ = ['GNMTEncoder', 'GNMTDecoder', 'GNMTOneStepDecoder', 'get_gnmt_encoder_decoder']
-
-import mxnet as mx
-from mxnet.base import _as_list
-from mxnet.gluon import nn, rnn
-from mxnet.gluon.block import HybridBlock
-from gluonnlp.model.seq2seq_encoder_decoder import Seq2SeqEncoder, Seq2SeqDecoder, \
- Seq2SeqOneStepDecoder, _nested_sequence_last
-from gluonnlp.model.utils import _get_cell_type
-from gluonnlp.model.attention_cell import _get_attention_cell
-
-
-class GNMTEncoder(Seq2SeqEncoder):
- r"""Structure of the RNN Encoder similar to that used in
- "[Arxiv2016] Google's Neural Machine Translation System:
- Bridgeing the Gap between Human and Machine Translation"
-
- The encoder first stacks several bidirectional RNN layers and then stacks multiple
- uni-directional RNN layers with residual connections.
-
- Parameters
- ----------
- cell_type : str or function
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- num_layers : int
- Total number of layers
- num_bi_layers : int
- Total number of bidirectional layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
- def __init__(self, cell_type='lstm', num_layers=2, num_bi_layers=1, hidden_size=128,
- dropout=0.0, use_residual=True,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
- prefix=None, params=None):
- super(GNMTEncoder, self).__init__(prefix=prefix, params=params)
- self._cell_type = _get_cell_type(cell_type)
- assert num_bi_layers <= num_layers,\
- 'Number of bidirectional layers must be smaller than the total number of layers, ' \
- 'num_bi_layers={}, num_layers={}'.format(num_bi_layers, num_layers)
- self._num_bi_layers = num_bi_layers
- self._num_layers = num_layers
- self._hidden_size = hidden_size
- self._dropout = dropout
- self._use_residual = use_residual
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- self.rnn_cells = nn.HybridSequential()
- for i in range(num_layers):
- if i < num_bi_layers:
- self.rnn_cells.add(rnn.BidirectionalCell(
- l_cell=self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_l_' % i),
- r_cell=self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_r_' % i)))
- else:
- self.rnn_cells.add(
- self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_' % i))
-
- def __call__(self, inputs, states=None, valid_length=None):
- """Encoder the inputs given the states and valid sequence length.
-
- Parameters
- ----------
- inputs : NDArray
- Input sequence. Shape (batch_size, length, C_in)
- states : list of NDArrays or None
- Initial states. The list of initial states
- valid_length : NDArray or None
- Valid lengths of each sequence. This is usually used when part of sequence has
- been padded. Shape (batch_size,)
-
- Returns
- -------
- encoder_outputs: list
- Outputs of the encoder. Contains:
-
- - outputs of the last RNN layer
- - new_states of all the RNN layers
- """
- return super(GNMTEncoder, self).__call__(inputs, states, valid_length)
-
- def forward(self, inputs, states=None, valid_length=None): #pylint: disable=arguments-differ, missing-docstring
- # TODO(sxjscience) Accelerate the forward using HybridBlock
- _, length, _ = inputs.shape
- new_states = []
- outputs = inputs
- for i, cell in enumerate(self.rnn_cells):
- begin_state = None if states is None else states[i]
- outputs, layer_states = cell.unroll(
- length=length, inputs=inputs, begin_state=begin_state, merge_outputs=True,
- valid_length=valid_length, layout='NTC')
- if i < self._num_bi_layers:
- # For bidirectional RNN, we use the states of the backward RNN
- new_states.append(layer_states[len(self.rnn_cells[i].state_info()) // 2:])
- else:
- new_states.append(layer_states)
- # Apply Dropout
- outputs = self.dropout_layer(outputs)
- if self._use_residual:
- if i > self._num_bi_layers:
- outputs = outputs + inputs
- inputs = outputs
- if valid_length is not None:
- outputs = mx.nd.SequenceMask(outputs, sequence_length=valid_length,
- use_sequence_length=True, axis=1)
- return [outputs, new_states], []
-
-
-class _BaseGNMTDecoder(HybridBlock):
- def __init__(self, cell_type='lstm', attention_cell='scaled_luong',
- num_layers=2, hidden_size=128,
- dropout=0.0, use_residual=True, output_attention=False,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer='zeros', h2h_bias_initializer='zeros',
- prefix=None, params=None):
- super().__init__(prefix=prefix, params=params)
- self._cell_type = _get_cell_type(cell_type)
- self._num_layers = num_layers
- self._hidden_size = hidden_size
- self._dropout = dropout
- self._use_residual = use_residual
- self._output_attention = output_attention
- with self.name_scope():
- self.attention_cell = _get_attention_cell(attention_cell, units=hidden_size)
- self.dropout_layer = nn.Dropout(dropout)
- self.rnn_cells = nn.HybridSequential()
- for i in range(num_layers):
- self.rnn_cells.add(
- self._cell_type(hidden_size=self._hidden_size,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer,
- prefix='rnn%d_' % i))
-
- def init_state_from_encoder(self, encoder_outputs, encoder_valid_length=None):
- """Initialize the state from the encoder outputs.
-
- Parameters
- ----------
- encoder_outputs : list
- encoder_valid_length : NDArray or None
-
- Returns
- -------
- decoder_states : list
- The decoder states, includes:
-
- - rnn_states : NDArray
- - attention_vec : NDArray
- - mem_value : NDArray
- - mem_masks : NDArray, optional
- """
- mem_value, rnn_states = encoder_outputs
- batch_size, _, mem_size = mem_value.shape
- attention_vec = mx.nd.zeros(shape=(batch_size, mem_size), ctx=mem_value.context)
- decoder_states = [rnn_states, attention_vec, mem_value]
- mem_length = mem_value.shape[1]
- if encoder_valid_length is not None:
- mem_masks = mx.nd.broadcast_lesser(
- mx.nd.arange(mem_length, ctx=encoder_valid_length.context).reshape((1, -1)),
- encoder_valid_length.reshape((-1, 1)))
- decoder_states.append(mem_masks)
- return decoder_states
-
- def forward(self, step_input, states): # pylint: disable=arguments-differ
- """One-step-ahead decoding of the GNMT decoder.
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- states : list of NDArray or Symbol
-
- Returns
- -------
- step_output : NDArray or Symbol
- The output of the decoder. Shape is (batch_size, C_out)
- new_states: list
- Includes
-
- - rnn_states : list of NDArray or Symbol
- - attention_vec : NDArray or Symbol, Shape (batch_size, C_memory)
- - mem_value : NDArray
- - mem_masks : NDArray, optional
-
- step_additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, 1, mem_length) or
- (batch_size, num_heads, 1, mem_length)
- """
- step_output, new_states, step_additional_outputs = super().forward(step_input, states)
- # In hybrid_forward, only the rnn_states and attention_vec are calculated.
- # We directly append the mem_value and mem_masks in the forward() function.
- # We apply this trick because the memory value/mask can be directly appended to the next
- # timestamp and there is no need to create additional NDArrays. If we use HybridBlock,
- # new NDArrays will be created even for identity mapping.
- # See https://github.com/apache/incubator-mxnet/issues/10167
- new_states += states[2:]
- return step_output, new_states, step_additional_outputs
-
- def hybrid_forward(self, F, step_input, states): #pylint: disable=arguments-differ
- """
-
- Parameters
- ----------
- step_input : NDArray or Symbol
- states : list of NDArray or Symbol
-
- Returns
- -------
- step_output : NDArray or Symbol
- The output of the decoder. Shape is (batch_size, C_out)
- new_states: list
- Includes
-
- - rnn_states : list of NDArray or Symbol
- - attention_vec : NDArray or Symbol, Shape (batch_size, C_memory)
-
- step_additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, 1, mem_length) or
- (batch_size, num_heads, 1, mem_length)
-
- """
- has_mem_mask = (len(states) == 4)
- if has_mem_mask:
- rnn_states, attention_output, mem_value, mem_masks = states
- mem_masks = F.expand_dims(mem_masks, axis=1)
- else:
- rnn_states, attention_output, mem_value = states
- mem_masks = None
- new_rnn_states = []
- # Process the first layer
- rnn_out, layer_state =\
- self.rnn_cells[0](F.concat(step_input, attention_output, dim=-1), rnn_states[0])
- new_rnn_states.append(layer_state)
- attention_vec, attention_weights =\
- self.attention_cell(F.expand_dims(rnn_out, axis=1), # Shape(B, 1, C)
- mem_value,
- mem_value,
- mem_masks)
- attention_vec = F.reshape(attention_vec, shape=(0, -1))
- # Process the 2nd layer - the last layer
- for i in range(1, len(self.rnn_cells)):
- curr_input = rnn_out
- rnn_cell = self.rnn_cells[i]
- # Concatenate the attention vector calculated by the bottom layer and the output of the
- # previous layer
- rnn_out, layer_state = rnn_cell(F.concat(curr_input, attention_vec, dim=-1),
- rnn_states[i])
- rnn_out = self.dropout_layer(rnn_out)
- if self._use_residual:
- rnn_out = rnn_out + curr_input
- # Append new RNN state
- new_rnn_states.append(layer_state)
- new_states = [new_rnn_states, attention_vec]
- step_additional_outputs = []
- if self._output_attention:
- step_additional_outputs.append(attention_weights)
- return rnn_out, new_states, step_additional_outputs
-
-
-class GNMTOneStepDecoder(_BaseGNMTDecoder, Seq2SeqOneStepDecoder):
- """RNN Encoder similar to that used in the Google Neural Machine Translation paper.
-
- One-step ahead decoder used during inference.
-
- We use gnmt_v2 strategy in tensorflow/nmt
-
- Parameters
- ----------
- cell_type : str or type
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- attention_cell : AttentionCell or str
- Arguments of the attention cell.
- Can be 'scaled_luong', 'normed_mlp', 'dot'
- num_layers : int
- Total number of layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- output_attention: bool
- Whether to output the attention weights
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
-
-class GNMTDecoder(_BaseGNMTDecoder, Seq2SeqDecoder):
- """RNN Encoder similar to that used in the Google Neural Machine Translation paper.
-
- Multi-step decoder used during training with teacher forcing.
-
- We use gnmt_v2 strategy in tensorflow/nmt
-
- Parameters
- ----------
- cell_type : str or type
- Can be "lstm", "gru" or constructor functions that can be directly called,
- like rnn.LSTMCell
- attention_cell : AttentionCell or str
- Arguments of the attention cell.
- Can be 'scaled_luong', 'normed_mlp', 'dot'
- num_layers : int
- Total number of layers
- hidden_size : int
- Number of hidden units
- dropout : float
- The dropout rate
- use_residual : bool
- Whether to use residual connection. Residual connection will be added in the
- uni-directional RNN layers
- output_attention: bool
- Whether to output the attention weights
- i2h_weight_initializer : str or Initializer
- Initializer for the input weights matrix, used for the linear
- transformation of the inputs.
- h2h_weight_initializer : str or Initializer
- Initializer for the recurrent weights matrix, used for the linear
- transformation of the recurrent state.
- i2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- h2h_bias_initializer : str or Initializer
- Initializer for the bias vector.
- prefix : str, default 'rnn_'
- Prefix for name of `Block`s
- (and name of weight if params is `None`).
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
- """
-
- def forward(self, inputs, states, valid_length=None): # pylint: disable=arguments-differ
- """Decode the decoder inputs. This function is only used for training.
-
- Parameters
- ----------
- inputs : NDArray, Shape (batch_size, length, C_in)
- states : list of NDArrays or None
- Initial states. The list of initial decoder states
- valid_length : NDArray or None
- Valid lengths of each sequence. This is usually used when part of sequence has
- been padded. Shape (batch_size,)
-
- Returns
- -------
- output : NDArray, Shape (batch_size, length, C_out)
- states : list
- The decoder states, includes:
-
- - rnn_states : NDArray
- - attention_vec : NDArray
- - mem_value : NDArray
- - mem_masks : NDArray, optional
- additional_outputs : list
- Either be an empty list or contains the attention weights in this step.
- The attention weights will have shape (batch_size, length, mem_length) or
- (batch_size, num_heads, length, mem_length)
- """
- length = inputs.shape[1]
- output = []
- additional_outputs = []
- inputs = _as_list(mx.nd.split(inputs, num_outputs=length, axis=1, squeeze_axis=True))
- rnn_states_l = []
- attention_output_l = []
- fixed_states = states[2:]
- for i in range(length):
- ele_output, states, ele_additional_outputs = super().forward(inputs[i], states)
- rnn_states_l.append(states[0])
- attention_output_l.append(states[1])
- output.append(ele_output)
- additional_outputs.extend(ele_additional_outputs)
- output = mx.nd.stack(*output, axis=1)
- if valid_length is not None:
- states = [_nested_sequence_last(rnn_states_l, valid_length),
- _nested_sequence_last(attention_output_l, valid_length)] + fixed_states
- output = mx.nd.SequenceMask(output,
- sequence_length=valid_length,
- use_sequence_length=True,
- axis=1)
- if self._output_attention:
- additional_outputs = [mx.nd.concat(*additional_outputs, dim=-2)]
- return output, states, additional_outputs
-
-
-def get_gnmt_encoder_decoder(cell_type='lstm', attention_cell='scaled_luong', num_layers=2,
- num_bi_layers=1, hidden_size=128, dropout=0.0, use_residual=False,
- i2h_weight_initializer=None, h2h_weight_initializer=None,
- i2h_bias_initializer=mx.init.LSTMBias(forget_bias=1.0),
- h2h_bias_initializer='zeros',
- prefix='gnmt_', params=None):
- """Build a pair of GNMT encoder/decoder
-
- Parameters
- ----------
- cell_type : str or type
- attention_cell : str or AttentionCell
- num_layers : int
- num_bi_layers : int
- hidden_size : int
- dropout : float
- use_residual : bool
- i2h_weight_initializer : mx.init.Initializer or None
- h2h_weight_initializer : mx.init.Initializer or None
- i2h_bias_initializer : mx.init.Initializer or None
- h2h_bias_initializer : mx.init.Initializer or None
- prefix : str, default 'gnmt_'
- Prefix for name of `Block`s.
- params : Parameter or None
- Container for weight sharing between cells.
- Created if `None`.
-
- Returns
- -------
- encoder : GNMTEncoder
- decoder : GNMTDecoder
- """
- encoder = GNMTEncoder(cell_type=cell_type, num_layers=num_layers, num_bi_layers=num_bi_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'enc_',
- params=params)
- decoder = GNMTDecoder(cell_type=cell_type, attention_cell=attention_cell, num_layers=num_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer,
- i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'dec_',
- params=params)
- one_step_ahead_decoder = GNMTOneStepDecoder(
- cell_type=cell_type, attention_cell=attention_cell, num_layers=num_layers,
- hidden_size=hidden_size, dropout=dropout, use_residual=use_residual,
- i2h_weight_initializer=i2h_weight_initializer,
- h2h_weight_initializer=h2h_weight_initializer, i2h_bias_initializer=i2h_bias_initializer,
- h2h_bias_initializer=h2h_bias_initializer, prefix=prefix + 'dec_',
- params=decoder.collect_params())
- return encoder, decoder, one_step_ahead_decoder
diff --git a/scripts/machine_translation/hyperparameters.py b/scripts/machine_translation/hyperparameters.py
deleted file mode 100644
index f9aaf9a7fb..0000000000
--- a/scripts/machine_translation/hyperparameters.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Hyperparameters for transformer, for past reference only."""
-
-# parameters for dataset
-src_lang = 'en'
-tgt_lang = 'de'
-src_max_len = -1
-tgt_max_len = -1
-
-# parameters for model
-num_units = 512
-hidden_size = 2048
-dropout = 0.1
-epsilon = 0.1
-num_layers = 6
-num_heads = 8
-scaled = True
-
-# parameters for training
-optimizer = 'adam'
-epochs = 3
-batch_size = 2700
-test_batch_size = 256
-num_accumulated = 1
-lr = 2
-warmup_steps = 1
-save_dir = 'transformer_en_de_u512'
-average_start = 1
-num_buckets = 20
-log_interval = 10
-bleu = '13a'
-
-#parameters for testing
-beam_size = 4
-lp_alpha = 0.6
-lp_k = 5
diff --git a/scripts/machine_translation/index.rst b/scripts/machine_translation/index.rst
deleted file mode 100644
index a228ee24ed..0000000000
--- a/scripts/machine_translation/index.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-Machine Translation
--------------------
-
-:download:`Download scripts `
-
-Google Neural Machine Translation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Use the following command to train the GNMT model on the IWSLT2015 dataset.
-
-.. code-block:: console
-
- $ MXNET_GPU_MEM_POOL_TYPE=Round python train_gnmt.py --src_lang en --tgt_lang vi --batch_size 128 \
- --optimizer adam --lr 0.001 --lr_update_factor 0.5 --beam_size 10 --bucket_scheme exp \
- --num_hidden 512 --save_dir gnmt_en_vi_l2_h512_beam10 --epochs 12 --gpu 0
-
-It gets test BLEU score equals to 26.20.
-
-Transformers
-~~~~~~~~~~~~
-
-Use the following commands to train the Transformer model on the WMT14 dataset for English to German translation.
-
-.. code-block:: console
-
- $ MXNET_GPU_MEM_POOL_TYPE=Round python train_transformer.py --dataset WMT2014BPE \
- --src_lang en --tgt_lang de --batch_size 2700 \
- --optimizer adam --num_accumulated 16 --lr 2.0 --warmup_steps 4000 \
- --save_dir transformer_en_de_u512 --epochs 30 --gpus 0,1,2,3,4,5,6,7 --scaled \
- --average_start 5 --num_buckets 20 --bucket_scheme exp --bleu 13a --log_interval 10
-
-It gets official mteval-v13a BLEU score equals to 27.09 on newstest2014 (http://statmt.org/wmt14/test-filtered.tgz).
-This result is obtained by using averaged SGD in last 5 epochs. If we use international tokenization (i.e., ``--bleu intl``),
-we can obtain bleu score equals to 27.89. If we use ``--bleu tweaked``, we obtain test BLEU score equals to 28.96.
-This result is obtained on tweaked reference, where the tokenized reference text is put in ATAT format for historical reason
-and following preprocessing pipeline is done:
-
-.. code-block:: console
-
- mosesdecoder/scripts/tokenizer/normalize-punctuation.perl -l de
- mosesdecoder/scripts/tokenizer/remove-non-printing-char.perl
- mosesdecoder/scripts/tokenizer/tokenizer.perl -q -no-escape -protected mosesdecoder/scripts/tokenizer/basic-protected-patterns -l de.
-
-If we turn on ``--full``, the testing is performed on newstest2014 (http://statmt.org/wmt14/test-full.tgz). Then, we can
-obtain BLEU=27.05 with ``--bleu 13a``, BLEU=27.81 with ``--bleu intl``, and BLEU=28.80 with ``--bleu tweaked``
-
-The pre-trained model can be downloaded from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-
-Use the following commands to inference the Transformer model on the WMT14 test dataset for English to German translation.
-
-.. code-block:: console
-
- $ python inference_transformer.py --dataset WMT2014BPE
- --src_lang en \
- --tgt_lang de \
- --batch_size 2700 \
- --scaled \
- --num_buckets 20 \
- --bucket_scheme exp \
- --bleu 13a \
- --log_interval 10 \
- --gpu 0 \
- --model_parameter PATH/TO/valid_best.params
-
-Before inference, you should do a complete training at least one time to get the pre-trained model, or you can get the pre-trained model from http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
-For the users from China, it might be faster with this link instead: https://apache-mxnet.s3.cn-north-1.amazonaws.com.cn/gluon/models/transformer_en_de_512_WMT2014-e25287c5.zip.
-
diff --git a/scripts/machine_translation/inference_transformer.py b/scripts/machine_translation/inference_transformer.py
deleted file mode 100644
index 178270a6d6..0000000000
--- a/scripts/machine_translation/inference_transformer.py
+++ /dev/null
@@ -1,300 +0,0 @@
-"""
-Transformer
-=================================
-
-This example shows how to implement the Transformer model with Gluon NLP Toolkit.
-
-@inproceedings{vaswani2017attention,
- title={Attention is all you need},
- author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones,
- Llion and Gomez, Aidan N and Kaiser, Lukasz and Polosukhin, Illia},
- booktitle={Advances in Neural Information Processing Systems},
- pages={6000--6010},
- year={2017}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import argparse
-import time
-import random
-import os
-import zipfile
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-from mxnet.gluon.utils import download, check_sha1
-import gluonnlp as nlp
-
-from gluonnlp.loss import MaskedSoftmaxCELoss
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.model.transformer import get_transformer_encoder_decoder
-from translation import BeamSearchTranslator
-from utils import logging_config
-from bleu import _bpe_to_words, compute_bleu
-import dataprocessor
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.7.0')
-
-parser = argparse.ArgumentParser(description='Neural Machine Translation Example.'
- 'We use this script only for transformer inference.')
-parser.add_argument('--dataset', type=str, default='WMT2014BPE', help='Dataset to use.')
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
-parser.add_argument('--num_units', type=int, default=512, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--hidden_size', type=int, default=2048,
- help='Dimension of the hidden state in position-wise feed-forward networks.')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--num_layers', type=int, default=6,
- help='number of layers in the encoder and decoder')
-parser.add_argument('--num_heads', type=int, default=8,
- help='number of heads in multi-head attention')
-parser.add_argument('--scaled', action='store_true', help='Turn on to use scale in attention')
-parser.add_argument('--batch_size', type=int, default=1024,
- help='Batch size. Number of tokens in a minibatch')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=0.6,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=256, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=10, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=-1, help='Maximum length of the source '
- 'sentence, -1 means no clipping')
-parser.add_argument('--tgt_max_len', type=int, default=-1, help='Maximum length of the target '
- 'sentence, -1 means no clipping')
-parser.add_argument('--full', action='store_true',
- help='In default, we use the test dataset in'
- ' http://statmt.org/wmt14/test-filtered.tgz.'
- ' When the option full is turned on, we use the test dataset in'
- ' http://statmt.org/wmt14/test-full.tgz')
-parser.add_argument('--bleu', type=str, default='tweaked',
- help='Schemes for computing bleu score. It can be: '
- '"tweaked": it uses similar steps in get_ende_bleu.sh in tensor2tensor '
- 'repository, where compound words are put in ATAT format; '
- '"13a": This uses official WMT tokenization and produces the same results'
- ' as official script (mteval-v13a.pl) used by WMT; '
- '"intl": This use international tokenization in mteval-v14a.pl')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='transformer_out',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpu', type=int,
- help='gpu id, e.g. 0 or 1. Unspecified means using cpu.')
-parser.add_argument('--model_parameter', type=str, default=' ', required=True,
- help='model parameter for inference, must be provided.')
-
-args = parser.parse_args()
-logging_config(args.save_dir)
-logging.info(args)
-
-# data process
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args)
-
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-
-data_train_lengths, data_val_lengths, data_test_lengths = [dataprocessor.get_data_lengths(x)
- for x in
- [data_train, data_val, data_test]]
-
-detokenizer = nlp.data.SacreMosesDetokenizer()
-
-# model prepare
-ctx = [mx.cpu()] if args.gpu is None else [mx.gpu(args.gpu)]
-
-if args.src_max_len <= 0 or args.tgt_max_len <= 0:
- max_len = np.max(
- [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0),
- np.max(data_test_lengths, axis=0)],
- axis=0)
-
-if args.src_max_len > 0:
- src_max_len = args.src_max_len
-else:
- src_max_len = max_len[0]
-if args.tgt_max_len > 0:
- tgt_max_len = args.tgt_max_len
-else:
- tgt_max_len = max_len[1]
-
-encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder(
- units=args.num_units, hidden_size=args.hidden_size, dropout=args.dropout,
- num_layers=args.num_layers, num_heads=args.num_heads, max_src_length=max(src_max_len, 500),
- max_tgt_length=max(tgt_max_len, 500), scaled=args.scaled)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder, share_embed=args.dataset != 'TOY',
- embed_size=args.num_units, tie_weights=args.dataset != 'TOY',
- embed_initializer=None, prefix='transformer_')
-
-param_name = args.model_parameter
-if (not os.path.exists(param_name)):
- archive_param_url = 'http://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/{}'
- archive_file_hash = ('transformer_en_de_512_WMT2014-e25287c5.zip',
- '5193b469e0e2dfdda3c834f9212420758a0d1d71')
- param_file_hash = ('transformer_en_de_512_WMT2014-e25287c5.params',
- 'e25287c5a924b7025e08d626f02626d5fa3af2d1')
- archive_file, archive_hash = archive_file_hash
- param_file, param_hash = param_file_hash
- logging.warning('The provided param file {} does not exist, start to download it from {}...'
- .format(param_name, archive_param_url.format(archive_file)))
-
- root_dir = os.path.dirname(__file__)
- archive_file_path = '{}/{}'.format(root_dir, archive_file)
- param_name = '{}/{}'.format(root_dir, param_file)
- if (not os.path.exists(param_name) or not check_sha1(param_name, param_hash)):
- download(archive_param_url.format(archive_file),
- path=archive_file_path,
- sha1_hash=archive_hash)
- with zipfile.ZipFile(archive_file_path) as zf:
- zf.extractall(root_dir)
-
-model.load_parameters(param_name, ctx)
-
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-# translator prepare
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=200)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-test_loss_function = MaskedSoftmaxCELoss()
-test_loss_function.hybridize(static_alloc=static_alloc)
-
-def inference():
- """inference function."""
- logging.info('Inference on test_dataset!')
-
- # data prepare
- test_data_loader = dataprocessor.get_dataloader(data_test, args,
- dataset_type='test',
- use_average_length=True)
-
- if args.bleu == 'tweaked':
- bpe = bool(args.dataset != 'IWSLT2015' and args.dataset != 'TOY')
- split_compound_word = bpe
- tokenized = True
- elif args.bleu == '13a' or args.bleu == 'intl':
- bpe = False
- split_compound_word = False
- tokenized = False
- else:
- raise NotImplementedError
-
- translation_out = []
- all_inst_ids = []
- total_wc = 0
- total_time = 0
- batch_total_blue = 0
-
- for batch_id, (src_seq, tgt_seq, src_test_length, tgt_test_length, inst_ids) \
- in enumerate(test_data_loader):
-
- total_wc += src_test_length.sum().asscalar() + tgt_test_length.sum().asscalar()
-
- src_seq = src_seq.as_in_context(ctx[0])
- tgt_seq = tgt_seq.as_in_context(ctx[0])
- src_test_length = src_test_length.as_in_context(ctx[0])
- tgt_test_length = tgt_test_length.as_in_context(ctx[0])
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
-
- start = time.time()
- # Translate to get a bleu score
- samples, _, sample_test_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_test_length)
- total_time += (time.time() - start)
-
- # generator the translator result for each batch
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_test_length = sample_test_length[:, 0].asnumpy()
- translation_tmp = []
- translation_tmp_sentences = []
- for i in range(max_score_sample.shape[0]):
- translation_tmp.append([tgt_vocab.idx_to_token[ele] for ele in \
- max_score_sample[i][1:(sample_test_length[i] - 1)]])
-
- # detokenizer each translator result
- for _, sentence in enumerate(translation_tmp):
- if args.bleu == 'tweaked':
- translation_tmp_sentences.append(sentence)
- translation_out.append(sentence)
- elif args.bleu == '13a' or args.bleu == 'intl':
- translation_tmp_sentences.append(detokenizer(_bpe_to_words(sentence)))
- translation_out.append(detokenizer(_bpe_to_words(sentence)))
- else:
- raise NotImplementedError
-
- # generate tgt_sentence for bleu calculation of each batch
- tgt_sen_tmp = [test_tgt_sentences[index] for \
- _, index in enumerate(inst_ids.asnumpy().astype(np.int32).tolist())]
- batch_test_bleu_score, _, _, _, _ = compute_bleu([tgt_sen_tmp], translation_tmp_sentences,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- batch_total_blue += batch_test_bleu_score
-
- # log for every ten batchs
- if batch_id % 10 == 0 and batch_id != 0:
- batch_ave_bleu = batch_total_blue / 10
- batch_total_blue = 0
- logging.info('batch id={:d}, batch_bleu={:.4f}'
- .format(batch_id, batch_ave_bleu * 100))
-
- # reorg translation sentences by inst_ids
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
-
- # get bleu score, n-gram precisions, brevity penalty, reference length, and translation length
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], real_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
-
- logging.info('Inference at test dataset. \
- inference bleu={:.4f}, throughput={:.4f}K wps'
- .format(test_bleu_score * 100, total_wc / total_time / 1000))
-
-
-if __name__ == '__main__':
- inference()
diff --git a/scripts/machine_translation/train_gnmt.py b/scripts/machine_translation/train_gnmt.py
deleted file mode 100644
index da1c61f2d9..0000000000
--- a/scripts/machine_translation/train_gnmt.py
+++ /dev/null
@@ -1,285 +0,0 @@
-"""
-Google Neural Machine Translation
-=================================
-
-This example shows how to implement the GNMT model with Gluon NLP Toolkit.
-
-@article{wu2016google,
- title={Google's neural machine translation system:
- Bridging the gap between human and machine translation},
- author={Wu, Yonghui and Schuster, Mike and Chen, Zhifeng and Le, Quoc V and
- Norouzi, Mohammad and Macherey, Wolfgang and Krikun, Maxim and Cao, Yuan and Gao, Qin and
- Macherey, Klaus and others},
- journal={arXiv preprint arXiv:1609.08144},
- year={2016}
-}
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint:disable=redefined-outer-name,logging-format-interpolation
-
-import argparse
-import time
-import random
-import os
-import logging
-import numpy as np
-import mxnet as mx
-from mxnet import gluon
-import gluonnlp as nlp
-
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.loss import MaskedSoftmaxCELoss
-from gnmt import get_gnmt_encoder_decoder
-from translation import BeamSearchTranslator
-from utils import logging_config
-from bleu import compute_bleu
-import dataprocessor
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.9.0')
-
-parser = argparse.ArgumentParser(description='Neural Machine Translation Example.'
- 'We train the Google NMT model')
-parser.add_argument('--dataset', type=str, default='IWSLT2015', help='Dataset to use.')
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='vi', help='Target language')
-parser.add_argument('--epochs', type=int, default=40, help='upper epoch limit')
-parser.add_argument('--num_hidden', type=int, default=128, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--dropout', type=float, default=0.2,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--num_layers', type=int, default=2, help='number of layers in the encoder'
- ' and decoder')
-parser.add_argument('--num_bi_layers', type=int, default=1,
- help='number of bidirectional layers in the encoder and decoder')
-parser.add_argument('--batch_size', type=int, default=128, help='Batch size')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=1.0,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=32, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=5, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=50, help='Maximum length of the source '
- 'sentence')
-parser.add_argument('--tgt_max_len', type=int, default=50, help='Maximum length of the target '
- 'sentence')
-parser.add_argument('--optimizer', type=str, default='adam', help='optimization algorithm')
-parser.add_argument('--lr', type=float, default=1E-3, help='Initial learning rate')
-parser.add_argument('--lr_update_factor', type=float, default=0.5,
- help='Learning rate decay factor')
-parser.add_argument('--clip', type=float, default=5.0, help='gradient clipping')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='out_dir',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpu', type=int, default=None,
- help='id of the gpu to use. Set it to empty means to use cpu.')
-args = parser.parse_args()
-print(args)
-logging_config(args.save_dir)
-
-
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab\
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu='tweaked', args=args)
-
-dataprocessor.write_sentences(val_tgt_sentences, os.path.join(args.save_dir, 'val_gt.txt'))
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-if args.gpu is None:
- ctx = mx.cpu()
- print('Use CPU')
-else:
- ctx = mx.gpu(args.gpu)
-
-encoder, decoder, one_step_ahead_decoder = get_gnmt_encoder_decoder(
- hidden_size=args.num_hidden, dropout=args.dropout, num_layers=args.num_layers,
- num_bi_layers=args.num_bi_layers)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder, embed_size=args.num_hidden,
- prefix='gnmt_')
-model.initialize(init=mx.init.Uniform(0.1), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=args.tgt_max_len + 100)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-
-loss_function = MaskedSoftmaxCELoss()
-loss_function.hybridize(static_alloc=static_alloc)
-
-
-def evaluate(data_loader):
- """Evaluate given the data loader
-
- Parameters
- ----------
- data_loader : DataLoader
-
- Returns
- -------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
- """
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).sum().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_valid_length - 1).sum().asscalar()
- # Translate
- samples, _, sample_valid_length = translator.translate(
- src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- real_translation_out[ind] = sentence
- return avg_loss, real_translation_out
-
-
-def train():
- """Training function."""
- trainer = gluon.Trainer(model.collect_params(), args.optimizer, {'learning_rate': args.lr})
-
- train_data_loader, val_data_loader, test_data_loader \
- = dataprocessor.make_dataloader(data_train, data_val, data_test, args)
-
- best_valid_bleu = 0.0
- for epoch_id in range(args.epochs):
- log_loss = 0
- log_denom = 0
- log_avg_gnorm = 0
- log_wc = 0
- log_start_time = time.time()
- for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length)\
- in enumerate(train_data_loader):
- # logging.info(src_seq.context) Context suddenly becomes GPU.
- src_seq = src_seq.as_in_context(ctx)
- tgt_seq = tgt_seq.as_in_context(ctx)
- src_valid_length = src_valid_length.as_in_context(ctx)
- tgt_valid_length = tgt_valid_length.as_in_context(ctx)
- with mx.autograd.record():
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
- loss = loss * (tgt_seq.shape[1] - 1)
- log_loss += loss * tgt_seq.shape[0]
- log_denom += (tgt_valid_length - 1).sum()
- loss = loss / (tgt_valid_length - 1).mean()
- loss.backward()
- grads = [p.grad(ctx) for p in model.collect_params().values()]
- gnorm = gluon.utils.clip_global_norm(grads, args.clip)
- trainer.step(1)
- src_wc = src_valid_length.sum().asscalar()
- tgt_wc = (tgt_valid_length - 1).sum().asscalar()
- log_loss = log_loss.asscalar()
- log_denom = log_denom.asscalar()
- log_avg_gnorm += gnorm
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % args.log_interval == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, gnorm={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_loss / log_denom,
- np.exp(log_loss / log_denom),
- log_avg_gnorm / args.log_interval,
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_loss = 0
- log_denom = 0
- log_avg_gnorm = 0
- log_wc = 0
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_valid_out.txt').format(epoch_id))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_test_out.txt').format(epoch_id))
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(args.save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
- if epoch_id + 1 >= (args.epochs * 2) // 3:
- new_lr = trainer.learning_rate * args.lr_update_factor
- logging.info('Learning rate change to {}'.format(new_lr))
- trainer.set_learning_rate(new_lr)
- if os.path.exists(os.path.join(args.save_dir, 'valid_best.params')):
- model.load_parameters(os.path.join(args.save_dir, 'valid_best.params'))
- valid_loss, valid_translation_out = evaluate(val_data_loader)
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out)
- logging.info('Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader)
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out)
- logging.info('Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir, 'best_valid_out.txt'))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir, 'best_test_out.txt'))
-
-
-if __name__ == '__main__':
- train()
diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py
index baa8249c04..655c2771b5 100644
--- a/scripts/machine_translation/train_transformer.py
+++ b/scripts/machine_translation/train_transformer.py
@@ -2,7 +2,7 @@
Transformer
=================================
-This example shows how to implement the Transformer model with Gluon NLP Toolkit.
+This example shows how to implement the Transformer model with GluonNLP Toolkit.
@inproceedings{vaswani2017attention,
title={Attention is all you need},
@@ -33,380 +33,501 @@
# pylint:disable=redefined-outer-name,logging-format-interpolation
import argparse
+import time
+import random
+import os
import logging
+import itertools
import math
-import os
-import random
-import time
-
import numpy as np
import mxnet as mx
from mxnet import gluon
-
-import gluonnlp as nlp
-from gluonnlp.loss import LabelSmoothing, MaskedSoftmaxCELoss
-from gluonnlp.model.transformer import ParallelTransformer, get_transformer_encoder_decoder
-from gluonnlp.model.translation import NMTModel
-from gluonnlp.utils.parallel import Parallel
-import dataprocessor
-from bleu import _bpe_to_words, compute_bleu
-from translation import BeamSearchTranslator
-from utils import logging_config
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-nlp.utils.check_version('0.9.0')
-
-parser = argparse.ArgumentParser(
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- description='Neural Machine Translation Example with the Transformer Model.')
-parser.add_argument('--dataset', type=str.upper, default='WMT2016BPE', help='Dataset to use.',
- choices=['IWSLT2015', 'WMT2016BPE', 'WMT2014BPE', 'TOY'])
-parser.add_argument('--src_lang', type=str, default='en', help='Source language')
-parser.add_argument('--tgt_lang', type=str, default='de', help='Target language')
-parser.add_argument('--epochs', type=int, default=10, help='upper epoch limit')
-parser.add_argument('--num_units', type=int, default=512, help='Dimension of the embedding '
- 'vectors and states.')
-parser.add_argument('--hidden_size', type=int, default=2048,
- help='Dimension of the hidden state in position-wise feed-forward networks.')
-parser.add_argument('--dropout', type=float, default=0.1,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--epsilon', type=float, default=0.1,
- help='epsilon parameter for label smoothing')
-parser.add_argument('--num_layers', type=int, default=6,
- help='number of layers in the encoder and decoder')
-parser.add_argument('--num_heads', type=int, default=8,
- help='number of heads in multi-head attention')
-parser.add_argument('--scaled', action='store_true', help='Turn on to use scale in attention')
-parser.add_argument('--batch_size', type=int, default=1024,
- help='Batch size. Number of tokens per gpu in a minibatch')
-parser.add_argument('--beam_size', type=int, default=4, help='Beam size')
-parser.add_argument('--lp_alpha', type=float, default=0.6,
- help='Alpha used in calculating the length penalty')
-parser.add_argument('--lp_k', type=int, default=5, help='K used in calculating the length penalty')
-parser.add_argument('--test_batch_size', type=int, default=256, help='Test batch size')
-parser.add_argument('--num_buckets', type=int, default=10, help='Bucket number')
-parser.add_argument('--bucket_scheme', type=str, default='constant',
- help='Strategy for generating bucket keys. It supports: '
- '"constant": all the buckets have the same width; '
- '"linear": the width of bucket increases linearly; '
- '"exp": the width of bucket increases exponentially')
-parser.add_argument('--bucket_ratio', type=float, default=0.0, help='Ratio for increasing the '
- 'throughput of the bucketing')
-parser.add_argument('--src_max_len', type=int, default=-1, help='Maximum length of the source '
- 'sentence, -1 means no clipping')
-parser.add_argument('--tgt_max_len', type=int, default=-1, help='Maximum length of the target '
- 'sentence, -1 means no clipping')
-parser.add_argument('--optimizer', type=str, default='adam', help='optimization algorithm')
-parser.add_argument('--lr', type=float, default=1.0, help='Initial learning rate')
-parser.add_argument('--warmup_steps', type=float, default=4000,
- help='number of warmup steps used in NOAM\'s stepsize schedule')
-parser.add_argument('--num_accumulated', type=int, default=1,
- help='Number of steps to accumulate the gradients. '
- 'This is useful to mimic large batch training with limited gpu memory')
-parser.add_argument('--magnitude', type=float, default=3.0,
- help='Magnitude of Xavier initialization')
-parser.add_argument('--average_checkpoint', action='store_true',
- help='Turn on to perform final testing based on '
- 'the average of last few checkpoints')
-parser.add_argument('--num_averages', type=int, default=5,
- help='Perform final testing based on the '
- 'average of last num_averages checkpoints. '
- 'This is only used if average_checkpoint is True')
-parser.add_argument('--average_start', type=int, default=5,
- help='Perform average SGD on last average_start epochs')
-parser.add_argument('--full', action='store_true',
- help='In default, we use the test dataset in'
- ' http://statmt.org/wmt14/test-filtered.tgz.'
- ' When the option full is turned on, we use the test dataset in'
- ' http://statmt.org/wmt14/test-full.tgz')
-parser.add_argument('--bleu', type=str, default='tweaked',
- help='Schemes for computing bleu score. It can be: '
- '"tweaked": it uses similar steps in get_ende_bleu.sh in tensor2tensor '
- 'repository, where compound words are put in ATAT format; '
- '"13a": This uses official WMT tokenization and produces the same results'
- ' as official script (mteval-v13a.pl) used by WMT; '
- '"intl": This use international tokenization in mteval-v14a.pl')
-parser.add_argument('--log_interval', type=int, default=100, metavar='N',
- help='report interval')
-parser.add_argument('--save_dir', type=str, default='transformer_out',
- help='directory path to save the final model and training log')
-parser.add_argument('--gpus', type=str,
- help='list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.'
- '(using single gpu is suggested)')
-args = parser.parse_args()
-logging_config(args.save_dir)
-logging.info(args)
-
-
-data_train, data_val, data_test, val_tgt_sentences, test_tgt_sentences, src_vocab, tgt_vocab \
- = dataprocessor.load_translation_data(dataset=args.dataset, bleu=args.bleu, args=args)
-
-dataprocessor.write_sentences(val_tgt_sentences, os.path.join(args.save_dir, 'val_gt.txt'))
-dataprocessor.write_sentences(test_tgt_sentences, os.path.join(args.save_dir, 'test_gt.txt'))
-
-data_train = data_train.transform(lambda src, tgt: (src, tgt, len(src), len(tgt)), lazy=False)
-data_val = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_val)])
-data_test = gluon.data.SimpleDataset([(ele[0], ele[1], len(ele[0]), len(ele[1]), i)
- for i, ele in enumerate(data_test)])
-
-ctx = [mx.cpu()] if args.gpus is None or args.gpus == '' else \
- [mx.gpu(int(x)) for x in args.gpus.split(',')]
-num_ctxs = len(ctx)
-
-data_train_lengths, data_val_lengths, data_test_lengths = [dataprocessor.get_data_lengths(x)
- for x in
- [data_train, data_val, data_test]]
-
-if args.src_max_len <= 0 or args.tgt_max_len <= 0:
- max_len = np.max(
- [np.max(data_train_lengths, axis=0), np.max(data_val_lengths, axis=0),
- np.max(data_test_lengths, axis=0)],
- axis=0)
-if args.src_max_len > 0:
- src_max_len = args.src_max_len
-else:
- src_max_len = max_len[0]
-if args.tgt_max_len > 0:
- tgt_max_len = args.tgt_max_len
-else:
- tgt_max_len = max_len[1]
-encoder, decoder, one_step_ahead_decoder = get_transformer_encoder_decoder(
- units=args.num_units, hidden_size=args.hidden_size, dropout=args.dropout,
- num_layers=args.num_layers, num_heads=args.num_heads, max_src_length=max(src_max_len, 500),
- max_tgt_length=max(tgt_max_len, 500), scaled=args.scaled)
-model = NMTModel(src_vocab=src_vocab, tgt_vocab=tgt_vocab, encoder=encoder, decoder=decoder,
- one_step_ahead_decoder=one_step_ahead_decoder,
- share_embed=args.dataset not in ('TOY', 'IWSLT2015'), embed_size=args.num_units,
- tie_weights=args.dataset not in ('TOY', 'IWSLT2015'), embed_initializer=None,
- prefix='transformer_')
-model.initialize(init=mx.init.Xavier(magnitude=args.magnitude), ctx=ctx)
-static_alloc = True
-model.hybridize(static_alloc=static_alloc)
-logging.info(model)
-
-translator = BeamSearchTranslator(model=model, beam_size=args.beam_size,
- scorer=nlp.model.BeamSearchScorer(alpha=args.lp_alpha,
- K=args.lp_k),
- max_length=200)
-logging.info('Use beam_size={}, alpha={}, K={}'.format(args.beam_size, args.lp_alpha, args.lp_k))
-
-label_smoothing = LabelSmoothing(epsilon=args.epsilon, units=len(tgt_vocab))
-label_smoothing.hybridize(static_alloc=static_alloc)
-
-loss_function = MaskedSoftmaxCELoss(sparse_label=False)
-loss_function.hybridize(static_alloc=static_alloc)
-
-test_loss_function = MaskedSoftmaxCELoss()
-test_loss_function.hybridize(static_alloc=static_alloc)
-
-rescale_loss = 100.
-parallel_model = ParallelTransformer(model, label_smoothing, loss_function, rescale_loss)
-detokenizer = nlp.data.SacreMosesDetokenizer()
-
-
-def evaluate(data_loader, context=ctx[0]):
- """Evaluate given the data loader
+from gluonnlp.models.transformer import TransformerModel
+from gluonnlp.utils.misc import logging_config, AverageSGDTracker, count_parameters,\
+ md5sum, grouper, init_comm
+from gluonnlp.data.sampler import (
+ ConstWidthBucket,
+ LinearWidthBucket,
+ ExpWidthBucket,
+ FixedBucketSampler,
+ BoundedBudgetSampler,
+ ShardedIterator
+)
+import gluonnlp.data.batchify as bf
+from gluonnlp.data import Vocab
+from gluonnlp.data import tokenizers
+from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
+from gluonnlp.lr_scheduler import InverseSquareRootScheduler
+from gluonnlp.loss import LabelSmoothCrossEntropyLoss
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ hvd = None
+
+mx.npx.set_np()
+
+
+CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
+if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH, exist_ok=True)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description='Transformer for Neural Machine Translation.')
+ parser.add_argument('--train_src_corpus', type=str,
+ help='The source training corpus.')
+ parser.add_argument('--train_tgt_corpus', type=str,
+ help='The target training corpus.')
+ parser.add_argument('--dev_src_corpus', type=str,
+ help='The source dev corpus.')
+ parser.add_argument('--dev_tgt_corpus', type=str,
+ help='The target dev corpus.')
+ parser.add_argument('--src_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe',
+ 'whitespace'],
+ default='whitespace', type=str,
+                        help='The source tokenizer. '
+                             'The whitespace tokenizer supports processing a pre-encoded corpus, '
+                             'while the other tokenizers support online encoding.')
+ parser.add_argument('--tgt_tokenizer', choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe',
+ 'whitespace'],
+ default='whitespace', type=str,
+ help='The target tokenizer.')
+ parser.add_argument('--src_subword_model_path', type=str,
+ help='Path to the source subword model.')
+ parser.add_argument('--src_vocab_path', type=str,
+ help='Path to the source vocab.')
+ parser.add_argument('--tgt_subword_model_path', type=str,
+ help='Path to the target subword model.')
+ parser.add_argument('--tgt_vocab_path', type=str,
+ help='Path to the target vocab.')
+ parser.add_argument('--seed', type=int, default=100, help='The random seed.')
+    parser.add_argument('--epochs', type=int, default=30, help='Upper epoch limit. '
+                        'The model keeps training indefinitely when both epochs < 0 and max_update < 0.')
+    parser.add_argument('--max_update', type=int, default=-1,
+                        help='Maximum number of update steps. When max_update > 0, epochs is set to -1, '
+                             'and each update step consumes gpu_num * num_accumulated batches.')
+ parser.add_argument('--save_interval_update', type=int, default=500,
+                        help='Interval (in update steps) for saving checkpoints when max_update is used.')
+ parser.add_argument('--cfg', type=str, default='transformer_base',
+ help='Configuration of the transformer model. '
+                             'You may provide a .yml file or use one of the prebuilt configurations.')
+ parser.add_argument('--label_smooth_alpha', type=float, default=0.1,
+ help='Weight of label smoothing')
+ parser.add_argument('--sampler', type=str, choices=['BoundedBudgetSampler', 'FixedBucketSampler'],
+ default='FixedBucketSampler', help='Type of sampler')
+ parser.add_argument('--batch_size', type=int, default=2700,
+ help='Batch size. Number of tokens per gpu in a minibatch.')
+ parser.add_argument('--val_batch_size', type=int, default=16,
+ help='Batch size for evaluation.')
+ parser.add_argument('--num_buckets', type=int, default=20, help='Bucket number.')
+ parser.add_argument('--bucket_scheme', type=str, default='exp',
+ help='Strategy for generating bucket keys. It supports: '
+ '"constant": all the buckets have the same width; '
+ '"linear": the width of bucket increases linearly; '
+ '"exp": the width of bucket increases exponentially')
+ parser.add_argument('--bucket_ratio', type=float, default=0.0,
+ help='Ratio for increasing the throughput of the bucketing')
+    parser.add_argument('--max_num_tokens', type=int, default=-1,
+                        help='Maximum number of tokens per batch. Only used by BoundedBudgetSampler.')
+    parser.add_argument('--max_num_sentences', type=int, default=-1,
+                        help='Maximum number of sentences per batch. Only used by BoundedBudgetSampler.')
+ parser.add_argument('--lr', type=float, default=0.002,
+ help='The learning rate at the end of the warmup stage. '
+ 'If it is not given, we will use the formula suggested in the '
+ 'original Transformer paper:'
+ ' 1.0 / sqrt(d_model) / sqrt(warmup_steps). '
+ 'Otherwise, we will use the given lr as the final learning rate in '
+ 'the warmup phase.')
+ parser.add_argument('--warmup_steps', type=int, default=4000,
+                        help='Number of warmup steps used in the Noam learning rate schedule.')
+ parser.add_argument('--warmup_init_lr', type=float, default=0.0,
+ help='Initial learning rate at the beginning of the warm-up stage')
+ parser.add_argument('--num_accumulated', type=int, default=32,
+ help='Number of steps to accumulate the gradients. '
+ 'This is useful to mimic large batch training with limited gpu memory')
+ parser.add_argument('--magnitude', type=float, default=3.0,
+ help='Magnitude of Xavier initialization')
+ parser.add_argument('--num_averages', type=int, default=-1,
+                        help='Perform final testing based on the '
+                             'average of the last num_averages checkpoints. '
+                             'Enabling checkpoint averaging causes extra GPU memory usage.')
+ parser.add_argument('--log_interval', type=int, default=10, metavar='N',
+ help='report interval')
+ parser.add_argument('--save_dir', type=str, default='transformer_out',
+ help='directory path to save the final model and training log')
+ parser.add_argument('--overwrite_cache', action='store_true')
+ parser.add_argument('--fp16', action='store_true',
+ help='Whether to use dtype float16')
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str,
+                        help='List of gpus to run on, e.g. 0 or 0,2,5. Empty means using the CPU.')
+ args = parser.parse_args()
+ if args.max_update > 0:
+ args.epochs = -1
+ logging_config(args.save_dir, console=True)
+ logging.info(args)
+ return args
+
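The --lr, --warmup_steps, and --warmup_init_lr options above describe an inverse-square-root schedule: the learning rate ramps linearly from warmup_init_lr to lr over warmup_steps updates and then decays like 1/sqrt(step). The sketch below is a plain-Python illustration of that shape under those assumptions; it is not the InverseSquareRootScheduler class used later, whose internals may differ.

import math

def inverse_sqrt_lr(step, base_lr=0.002, warmup_steps=4000, warmup_init_lr=0.0):
    # Illustrative schedule: linear warmup, then 1/sqrt(step) decay anchored at base_lr.
    if step < warmup_steps:
        return warmup_init_lr + (base_lr - warmup_init_lr) * step / warmup_steps
    return base_lr * math.sqrt(warmup_steps) / math.sqrt(step)

for step in (1, 1000, 4000, 16000, 64000):
    print(step, round(inverse_sqrt_lr(step), 6))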
+
+def validation(model, data_loader, ctx_l):
+ """Validate the model on the dataset
Parameters
----------
+ model : TransformerModel
+ The transformer model
data_loader : DataLoader
+        The validation data loader.
+    ctx_l : list
+        List of mx.Context, one per device used for evaluation.
+
Returns
-------
- avg_loss : float
- Average loss
- real_translation_out : list of list of str
- The translation output
+ avg_nll_loss : float
+ The average negative log-likelihood loss
"""
- translation_out = []
- all_inst_ids = []
- avg_loss_denom = 0
- avg_loss = 0.0
- for _, (src_seq, tgt_seq, src_valid_length, tgt_valid_length, inst_ids) \
- in enumerate(data_loader):
- src_seq = src_seq.as_in_context(context)
- tgt_seq = tgt_seq.as_in_context(context)
- src_valid_length = src_valid_length.as_in_context(context)
- tgt_valid_length = tgt_valid_length.as_in_context(context)
- # Calculating Loss
- out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length, tgt_valid_length - 1)
- loss = test_loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean().asscalar()
- all_inst_ids.extend(inst_ids.asnumpy().astype(np.int32).tolist())
- avg_loss += loss * (tgt_seq.shape[1] - 1)
- avg_loss_denom += (tgt_seq.shape[1] - 1)
- # Translate
- samples, _, sample_valid_length = \
- translator.translate(src_seq=src_seq, src_valid_length=src_valid_length)
- max_score_sample = samples[:, 0, :].asnumpy()
- sample_valid_length = sample_valid_length[:, 0].asnumpy()
- for i in range(max_score_sample.shape[0]):
- translation_out.append(
- [tgt_vocab.idx_to_token[ele] for ele in
- max_score_sample[i][1:(sample_valid_length[i] - 1)]])
- avg_loss = avg_loss / avg_loss_denom
- real_translation_out = [None for _ in range(len(all_inst_ids))]
- for ind, sentence in zip(all_inst_ids, translation_out):
- if args.bleu == 'tweaked':
- real_translation_out[ind] = sentence
- elif args.bleu == '13a' or args.bleu == 'intl':
- real_translation_out[ind] = detokenizer(_bpe_to_words(sentence))
- else:
- raise NotImplementedError
- return avg_loss, real_translation_out
-
-
-def train():
- """Training function."""
- trainer = gluon.Trainer(model.collect_params(), args.optimizer,
- {'learning_rate': args.lr, 'beta2': 0.98, 'epsilon': 1e-9})
+ avg_nll_loss = mx.np.array(0, dtype=np.float32, ctx=mx.cpu())
+ ntokens = 0
+ for sample_data_l in grouper(data_loader, len(ctx_l)):
+ loss_l = []
+ ntokens += sum([ele[3].sum().asnumpy() - ele[0].shape[0] for ele in sample_data_l
+ if ele is not None])
+ for sample_data, ctx in zip(sample_data_l, ctx_l):
+ if sample_data is None:
+ continue
+ src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
+ src_token_ids = src_token_ids.as_in_ctx(ctx)
+ tgt_token_ids = tgt_token_ids.as_in_ctx(ctx)
+ src_valid_length = src_valid_length.as_in_ctx(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_ctx(ctx)
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ tgt_labels = tgt_token_ids[:, 1:]
+ tgt_pred_logits = mx.npx.log_softmax(tgt_pred, axis=-1)
+ nll_loss = - mx.npx.pick(tgt_pred_logits, tgt_labels, axis=-1)
+ loss = mx.npx.sequence_mask(nll_loss,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True,
+ axis=1)
+ loss_l.append(loss.sum())
+ avg_nll_loss += sum([loss.as_in_ctx(mx.cpu()) for loss in loss_l])
+ mx.npx.waitall()
+ avg_loss = avg_nll_loss.asnumpy() / ntokens
+ return avg_loss
+
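validation() accumulates masked per-token negative log-likelihood across devices and divides by the number of target label tokens; the training loop reports np.exp of that value as perplexity. Below is a standalone NumPy illustration of the same bookkeeping, with made-up shapes and values.

import numpy as np

# Fake per-token NLL for two sequences with 4 label positions each.
nll = np.array([[2.1, 1.7, 0.9, 0.5],
                [1.2, 0.8, 0.0, 0.0]])
valid_length = np.array([4, 2])  # number of label tokens per sequence

# Mask padded positions, mirroring mx.npx.sequence_mask(..., axis=1).
mask = np.arange(nll.shape[1])[None, :] < valid_length[:, None]
avg_nll = (nll * mask).sum() / valid_length.sum()
print('avg nll = %.4f, ppl = %.4f' % (avg_nll, np.exp(avg_nll)))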
+
+def load_dataset_with_cache(src_corpus_path: str,
+ tgt_corpus_path: str,
+ src_tokenizer: BaseTokenizerWithVocab,
+ tgt_tokenizer: BaseTokenizerWithVocab,
+ overwrite_cache: bool,
+ local_rank: int):
+ # TODO online h5py multi processing encode (Tao)
+ src_md5sum = md5sum(src_corpus_path)
+ tgt_md5sum = md5sum(tgt_corpus_path)
+ cache_filepath = os.path.join(CACHE_PATH,
+ '{}_{}.cache.npz'.format(src_md5sum[:6], tgt_md5sum[:6]))
+ if os.path.exists(cache_filepath) and not overwrite_cache:
+ if local_rank == 0:
+ logging.info('Load cache from {}'.format(cache_filepath))
+ npz_data = np.load(cache_filepath, allow_pickle=True)
+ src_data, tgt_data = npz_data['src_data'][:], npz_data['tgt_data'][:]
+ else:
+ assert src_tokenizer.vocab.eos_id is not None,\
+ 'You will need to add the EOS token to the vocabulary used in the tokenizer of ' \
+ 'the source language.'
+ assert tgt_tokenizer.vocab.bos_id is not None and tgt_tokenizer.vocab.eos_id is not None, \
+ 'You will need to add both the BOS token and the EOS tokens to the vocabulary used ' \
+ 'in the tokenizer of the target language.'
+ src_data = []
+ tgt_data = []
+ # TODO(sxjscience) Optimize the speed of converting to cache
+ with open(src_corpus_path) as f:
+ for line in f:
+ sample = np.array(src_tokenizer.encode(line.strip(), output_type=int) +
+ [src_tokenizer.vocab.eos_id], dtype=np.int32)
+ src_data.append(sample)
+ with open(tgt_corpus_path) as f:
+ for line in f:
+ sample = np.array([tgt_tokenizer.vocab.bos_id] +
+ tgt_tokenizer.encode(line.strip(), output_type=int) +
+ [tgt_tokenizer.vocab.eos_id], dtype=np.int32)
+ tgt_data.append(sample)
+ src_data = np.array(src_data)
+ tgt_data = np.array(tgt_data)
+ np.savez(cache_filepath, src_data=src_data, tgt_data=tgt_data)
+ return src_data, tgt_data
+
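load_dataset_with_cache keys its cache file on the first six hex digits of each corpus file's MD5, so unchanged corpora are reloaded from the .npz file while any edit produces a new cache entry. The following standalone sketch shows that naming and save/load round-trip, using throwaway files and a simplified stand-in for gluonnlp's md5sum helper.

import hashlib
import numpy as np

def short_md5(path):
    # Simplified stand-in for gluonnlp.utils.misc.md5sum, truncated to 6 hex chars.
    with open(path, 'rb') as f:
        return hashlib.md5(f.read()).hexdigest()[:6]

with open('toy.en', 'w') as f:
    f.write('hello world\n')
with open('toy.de', 'w') as f:
    f.write('hallo welt\n')

cache_file = '{}_{}.cache.npz'.format(short_md5('toy.en'), short_md5('toy.de'))
src_data = np.array([np.array([3, 4, 2], dtype=np.int32),
                     np.array([7, 2], dtype=np.int32)], dtype=object)
tgt_data = np.array([np.array([1, 5, 6, 2], dtype=np.int32),
                     np.array([1, 8, 2], dtype=np.int32)], dtype=object)
np.savez(cache_file, src_data=src_data, tgt_data=tgt_data)

loaded = np.load(cache_file, allow_pickle=True)
print(cache_file, loaded['src_data'][0], loaded['tgt_data'][1])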
+
+def create_tokenizer(tokenizer_type, model_path, vocab_path):
+ if tokenizer_type == 'whitespace':
+ return tokenizers.create(tokenizer_type, vocab=Vocab.load(vocab_path))
+ elif tokenizer_type == 'spm':
+ return tokenizers.create(tokenizer_type, model_path=model_path, vocab=vocab_path)
+ elif tokenizer_type == 'subword_nmt':
+ return tokenizers.create(tokenizer_type, codec_path=model_path, vocab_path=vocab_path)
+ elif tokenizer_type == 'yttm':
+ return tokenizers.create(tokenizer_type, model_path=model_path)
+ elif tokenizer_type == 'hf_bytebpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_wordpiece':
+ return tokenizers.create(tokenizer_type, vocab_file=vocab_path)
+ elif tokenizer_type == 'hf_bpe':
+ return tokenizers.create(tokenizer_type, merges_file=model_path, vocab_file=vocab_path)
+ else:
+ raise NotImplementedError
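create_tokenizer simply forwards to gluonnlp.data.tokenizers.create with the keyword names each backend expects. Below is a hedged usage sketch for the whitespace case; 'wmt.vocab' is a placeholder path assumed to contain a vocabulary saved with gluonnlp's Vocab, not a file shipped with this script.

# Hypothetical usage; 'wmt.vocab' is a placeholder path.
tokenizer = create_tokenizer('whitespace', model_path=None, vocab_path='wmt.vocab')
token_ids = tokenizer.encode('ein kleines Beispiel', output_type=int)
print(token_ids, tokenizer.vocab.eos_id)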
- train_data_loader, val_data_loader, test_data_loader \
- = dataprocessor.make_dataloader(data_train, data_val, data_test, args,
- use_average_length=True, num_shards=len(ctx))
- if args.bleu == 'tweaked':
- bpe = bool(args.dataset != 'IWSLT2015' and args.dataset != 'TOY')
- split_compound_word = bpe
- tokenized = True
- elif args.bleu == '13a' or args.bleu == 'intl':
- bpe = False
- split_compound_word = False
- tokenized = False
+def train(args):
+ _, num_parts, rank, local_rank, _, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ src_tokenizer = create_tokenizer(args.src_tokenizer,
+ args.src_subword_model_path,
+ args.src_vocab_path)
+ tgt_tokenizer = create_tokenizer(args.tgt_tokenizer,
+ args.tgt_subword_model_path,
+ args.tgt_vocab_path)
+ src_vocab = src_tokenizer.vocab
+ tgt_vocab = tgt_tokenizer.vocab
+ train_src_data, train_tgt_data = load_dataset_with_cache(args.train_src_corpus,
+ args.train_tgt_corpus,
+ src_tokenizer,
+ tgt_tokenizer,
+ args.overwrite_cache,
+ local_rank)
+ dev_src_data, dev_tgt_data = load_dataset_with_cache(args.dev_src_corpus,
+ args.dev_tgt_corpus,
+ src_tokenizer,
+ tgt_tokenizer,
+ args.overwrite_cache,
+ local_rank)
+ data_train = gluon.data.SimpleDataset(
+ [(src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
+ for i, (src_tokens, tgt_tokens) in enumerate(zip(train_src_data, train_tgt_data))])
+ data_val = gluon.data.SimpleDataset(
+ [(src_tokens, tgt_tokens, len(src_tokens), len(tgt_tokens), i)
+ for i, (src_tokens, tgt_tokens) in enumerate(zip(dev_src_data, dev_tgt_data))])
+ # Construct the model + loss function
+ if args.cfg.endswith('.yml'):
+ cfg = TransformerModel.get_cfg().clone_merge(args.cfg)
+ else:
+ cfg = TransformerModel.get_cfg(args.cfg)
+ cfg.defrost()
+ cfg.MODEL.src_vocab_size = len(src_vocab)
+ cfg.MODEL.tgt_vocab_size = len(tgt_vocab)
+ if args.fp16:
+ raise NotImplementedError
+# cfg.MODEL.dtype = 'float16'
+ cfg.freeze()
+ model = TransformerModel.from_cfg(cfg)
+ model.initialize(mx.init.Xavier(magnitude=args.magnitude),
+ ctx=ctx_l)
+ model.hybridize()
+ if local_rank == 0:
+ logging.info(model)
+ with open(os.path.join(args.save_dir, 'config.yml'), 'w') as cfg_f:
+ cfg_f.write(cfg.dump())
+ label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=len(tgt_vocab),
+ alpha=args.label_smooth_alpha,
+ from_logits=False)
+ label_smooth_loss.hybridize()
+ rescale_loss = 100.0
+
+ if args.comm_backend == 'horovod':
+ hvd.broadcast_parameters(model.collect_params(), root_rank=0)
+
+ # Construct the trainer
+ # TODO(sxjscience) Support AMP
+ if args.lr is None:
+ base_lr = 2.0 / math.sqrt(args.num_units) / math.sqrt(args.warmup_steps)
+ else:
+ base_lr = args.lr
+ lr_scheduler = InverseSquareRootScheduler(warmup_steps=args.warmup_steps, base_lr=base_lr,
+ warmup_init_lr=args.warmup_init_lr)
+ trainer_settings = (model.collect_params(), 'adam',
+ {'learning_rate': args.lr, 'beta1': 0.9,
+ 'beta2': 0.98, 'epsilon': 1e-9, 'lr_scheduler': lr_scheduler})
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(*trainer_settings)
+ else:
+ trainer = gluon.Trainer(*trainer_settings)
+ # Load Data
+ if args.sampler == 'BoundedBudgetSampler':
+ train_batch_sampler = BoundedBudgetSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
+ max_num_tokens=args.max_num_tokens,
+ max_num_sentences=args.max_num_sentences,
+ seed=args.seed)
+ if num_parts > 1:
+ train_batch_sampler = ShardedIterator(train_batch_sampler, num_parts=num_parts, part_index=rank)
+ elif args.sampler == 'FixedBucketSampler':
+ if args.comm_backend == 'horovod':
+ raise NotImplementedError('FixedBucketSampler does not support horovod at present')
+
+ if args.bucket_scheme == 'constant':
+ bucket_scheme = ConstWidthBucket()
+ elif args.bucket_scheme == 'linear':
+ bucket_scheme = LinearWidthBucket()
+ elif args.bucket_scheme == 'exp':
+ bucket_scheme = ExpWidthBucket(bucket_len_step=1.2)
+ else:
+ raise NotImplementedError
+ # TODO(sxjscience) Support auto-bucket-size tuning
+ train_batch_sampler = FixedBucketSampler(lengths=[(ele[2], ele[3]) for ele in data_train],
+ batch_size=args.batch_size,
+ num_buckets=args.num_buckets,
+ ratio=args.bucket_ratio,
+ shuffle=True,
+ use_average_length=True,
+ bucket_scheme=bucket_scheme,
+ seed=args.seed)
else:
raise NotImplementedError
- best_valid_bleu = 0.0
- step_num = 0
- warmup_steps = args.warmup_steps
- grad_interval = args.num_accumulated
- model.collect_params().setattr('grad_req', 'add')
- average_start = (len(train_data_loader) // grad_interval) * (args.epochs - args.average_start)
- average_param_dict = None
- model.collect_params().zero_grad()
- parallel = Parallel(num_ctxs, parallel_model)
- for epoch_id in range(args.epochs):
- log_avg_loss = 0
- log_wc = 0
- loss_denom = 0
- step_loss = 0
- log_start_time = time.time()
- for batch_id, seqs \
- in enumerate(train_data_loader):
- if batch_id % grad_interval == 0:
- step_num += 1
- new_lr = args.lr / math.sqrt(args.num_units) \
- * min(1. / math.sqrt(step_num), step_num * warmup_steps ** (-1.5))
- trainer.set_learning_rate(new_lr)
- src_wc, tgt_wc, bs = np.sum([(shard[2].sum(), shard[3].sum(), shard[0].shape[0])
- for shard in seqs], axis=0)
- seqs = [[seq.as_in_context(context) for seq in shard]
- for context, shard in zip(ctx, seqs)]
- Ls = []
- for seq in seqs:
- parallel.put((seq, args.batch_size))
- Ls = [parallel.get() for _ in range(len(ctx))]
- src_wc = src_wc.asscalar()
- tgt_wc = tgt_wc.asscalar()
- loss_denom += tgt_wc - bs
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- if average_param_dict is None:
- average_param_dict = {k: v.data(ctx[0]).copy() for k, v in
- model.collect_params().items()}
- trainer.step(float(loss_denom) / args.batch_size / rescale_loss)
- param_dict = model.collect_params()
- param_dict.zero_grad()
- if step_num > average_start:
- alpha = 1. / max(1, step_num - average_start)
- for name, average_param in average_param_dict.items():
- average_param[:] += alpha * (param_dict[name].data(ctx[0]) - average_param)
- step_loss += sum([L.asscalar() for L in Ls])
- if batch_id % grad_interval == grad_interval - 1 or\
- batch_id == len(train_data_loader) - 1:
- log_avg_loss += step_loss / loss_denom * args.batch_size * rescale_loss
+ logging.info(train_batch_sampler)
+
+ batchify_fn = bf.Tuple(bf.Pad(), bf.Pad(), bf.Stack(), bf.Stack(), bf.Stack())
+ train_data_loader = gluon.data.DataLoader(data_train,
+ batch_sampler=train_batch_sampler,
+ batchify_fn=batchify_fn,
+ num_workers=0)
+
+ val_data_loader = gluon.data.DataLoader(data_val,
+ batch_size=args.val_batch_size,
+ batchify_fn=batchify_fn,
+ num_workers=0,
+ shuffle=False)
+ for v in model.collect_params().values():
+ if v.grad_req != 'null':
+ v.grad_req = 'add'
+ model.zero_grad()
+ model_averager = AverageSGDTracker(model.collect_params())
+ log_start_time = time.time()
+ num_params, num_fixed_params = None, None
+ # TODO(sxjscience) Add a log metric class
+ accum_count = 0
+ loss_denom = 0
+ n_train_iters = 0
+ log_wc = 0
+ log_avg_loss = 0.0
+ log_loss_denom = 0
+ epoch_id = 0
+ while (args.epochs < 0 or epoch_id < args.epochs): # when args.epochs < 0, the model will keep training
+ n_epoch_train_iters = 0
+ processed_batch_num = 0
+ train_multi_data_loader = grouper(train_data_loader, len(ctx_l))
+ is_last_batch = False
+ sample_data_l = next(train_multi_data_loader)
+ while not is_last_batch:
+ processed_batch_num += len(sample_data_l)
+ loss_l = []
+ for sample_data, ctx in zip(sample_data_l, ctx_l):
+ if sample_data is None:
+ continue
+ src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
+ src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0]
+ loss_denom += tgt_wc - bs
+ log_loss_denom += tgt_wc - bs
+ log_wc += src_wc + tgt_wc
+ src_token_ids = src_token_ids.as_in_ctx(ctx)
+ tgt_token_ids = tgt_token_ids.as_in_ctx(ctx)
+ src_valid_length = src_valid_length.as_in_ctx(ctx)
+ tgt_valid_length = tgt_valid_length.as_in_ctx(ctx)
+ with mx.autograd.record():
+ tgt_pred = model(src_token_ids, src_valid_length, tgt_token_ids[:, :-1],
+ tgt_valid_length - 1)
+ tgt_labels = tgt_token_ids[:, 1:]
+ loss = label_smooth_loss(tgt_pred, tgt_labels)
+ loss = mx.npx.sequence_mask(loss,
+ sequence_length=tgt_valid_length - 1,
+ use_sequence_length=True,
+ axis=1)
+ loss_l.append(loss.sum() / rescale_loss)
+ for l in loss_l:
+ l.backward()
+ accum_count += 1
+ try:
+ sample_data_l = next(train_multi_data_loader)
+ except StopIteration:
+ is_last_batch = True
+ if local_rank == 0 and num_params is None:
+ num_params, num_fixed_params = count_parameters(model.collect_params())
+ logging.info('Total Number of Parameters (not-fixed/fixed): {}/{}'
+ .format(num_params, num_fixed_params))
+ sum_loss = sum([l.as_in_ctx(mx.cpu()) for l in loss_l]) * rescale_loss
+ log_avg_loss += sum_loss
+ mx.npx.waitall()
+ if accum_count == args.num_accumulated or is_last_batch:
+ # Update the parameters
+ n_train_iters += 1
+ n_epoch_train_iters += 1
+ trainer.step(loss_denom.asnumpy() / rescale_loss)
+ accum_count = 0
loss_denom = 0
- step_loss = 0
- log_wc += src_wc + tgt_wc
- if (batch_id + 1) % (args.log_interval * grad_interval) == 0:
- wps = log_wc / (time.time() - log_start_time)
- logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
- 'throughput={:.2f}K wps, wc={:.2f}K'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- log_avg_loss / args.log_interval,
- np.exp(log_avg_loss / args.log_interval),
- wps / 1000, log_wc / 1000))
- log_start_time = time.time()
- log_avg_loss = 0
- log_wc = 0
- mx.nd.waitall()
- valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- logging.info('[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out,
- tokenized=tokenized, tokenizer=args.bleu,
- split_compound_word=split_compound_word,
- bpe=bpe)
- logging.info('[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_valid_out.txt').format(epoch_id))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir,
- 'epoch{:d}_test_out.txt').format(epoch_id))
- if valid_bleu_score > best_valid_bleu:
- best_valid_bleu = valid_bleu_score
- save_path = os.path.join(args.save_dir, 'valid_best.params')
- logging.info('Save best parameters to {}'.format(save_path))
- model.save_parameters(save_path)
- save_path = os.path.join(args.save_dir, 'epoch{:d}.params'.format(epoch_id))
- model.save_parameters(save_path)
- save_path = os.path.join(args.save_dir, 'average.params')
- mx.nd.save(save_path, average_param_dict)
- if args.average_checkpoint:
- for j in range(args.num_averages):
- params = mx.nd.load(os.path.join(args.save_dir,
- 'epoch{:d}.params'.format(args.epochs - j - 1)))
- alpha = 1. / (j + 1)
- for k, v in model._collect_params_with_prefix().items():
- for c in ctx:
- v.data(c)[:] += alpha * (params[k].as_in_context(c) - v.data(c))
- save_path = os.path.join(args.save_dir,
- 'average_checkpoint_{}.params'.format(args.num_averages))
- model.save_parameters(save_path)
- elif args.average_start > 0:
- for k, v in model.collect_params().items():
- v.set_data(average_param_dict[k])
- save_path = os.path.join(args.save_dir, 'average.params')
- model.save_parameters(save_path)
- else:
- model.load_parameters(os.path.join(args.save_dir, 'valid_best.params'), ctx)
- valid_loss, valid_translation_out = evaluate(val_data_loader, ctx[0])
- valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences], valid_translation_out,
- tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
- split_compound_word=split_compound_word)
- logging.info('Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'
- .format(valid_loss, np.exp(valid_loss), valid_bleu_score * 100))
- test_loss, test_translation_out = evaluate(test_data_loader, ctx[0])
- test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences], test_translation_out,
- tokenized=tokenized, tokenizer=args.bleu, bpe=bpe,
- split_compound_word=split_compound_word)
- logging.info('Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'
- .format(test_loss, np.exp(test_loss), test_bleu_score * 100))
- dataprocessor.write_sentences(valid_translation_out,
- os.path.join(args.save_dir, 'best_valid_out.txt'))
- dataprocessor.write_sentences(test_translation_out,
- os.path.join(args.save_dir, 'best_test_out.txt'))
+ model.zero_grad()
+ if (args.epochs > 0 and epoch_id >= args.epochs - args.num_averages) or \
+ (args.max_update > 0 and n_train_iters >= args.max_update - args.num_averages * args.save_interval_update):
+ model_averager.step()
+ if local_rank == 0 and \
+ (n_epoch_train_iters % args.log_interval == 0 or is_last_batch):
+ log_end_time = time.time()
+ log_wc = log_wc.asnumpy()
+ wps = log_wc / (log_end_time - log_start_time)
+ log_avg_loss = (log_avg_loss / log_loss_denom).asnumpy()
+ logging.info('[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}, '
+ 'throughput={:.2f}K wps, wc={:.2f}K, LR={}'
+ .format(epoch_id, processed_batch_num * num_parts,
+ len(train_data_loader), log_avg_loss, np.exp(log_avg_loss),
+ wps / 1000, log_wc / 1000, trainer.learning_rate))
+ log_start_time = time.time()
+ log_avg_loss = 0
+ log_loss_denom = 0
+ log_wc = 0
+ if local_rank == 0 and \
+ (args.max_update > 0 and n_train_iters % args.save_interval_update == 0):
+ n_update = n_train_iters // args.save_interval_update
+ model.save_parameters(os.path.join(args.save_dir,
+ 'update{:d}.params'.format(n_update)),
+ deduplicate=True)
+ avg_valid_loss = validation(model, val_data_loader, ctx_l)
+ logging.info('[Update {}] validation loss/ppl={:.4f}/{:.4f}'
+ .format(n_update, avg_valid_loss, np.exp(avg_valid_loss)))
+ if args.max_update > 0 and n_train_iters >= args.max_update:
+ break
+ if local_rank == 0:
+ model.save_parameters(os.path.join(args.save_dir,
+ 'epoch{:d}.params'.format(epoch_id)),
+ deduplicate=True)
+ avg_valid_loss = validation(model, val_data_loader, ctx_l)
+ logging.info('[Epoch {}] validation loss/ppl={:.4f}/{:.4f}'
+ .format(epoch_id, avg_valid_loss, np.exp(avg_valid_loss)))
+
+ if args.max_update > 0 and n_train_iters >= args.max_update:
+ break
+ epoch_id += 1
+
+ if args.num_averages > 0:
+ model_averager.copy_back(model.collect_params()) # TODO(sxjscience) Rewrite using update
+ model.save_parameters(os.path.join(args.save_dir, 'average.params'),
+ deduplicate=True)
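The update logic in train() sets grad_req to 'add' so each backward() accumulates into the gradient buffers, then calls trainer.step(loss_denom / rescale_loss) so the accumulated gradients are normalized by the number of target label tokens seen since the last step. Here is a framework-free sketch of that bookkeeping with a toy scalar gradient per micro-batch (values are made up):

# Toy illustration of the accumulate-then-step pattern used above.
batches = [(12.0, 300), (9.5, 250), (11.0, 280), (8.0, 210)]  # (summed gradient, target tokens)
num_accumulated = 2
rescale_loss = 100.0

grad_buffer = 0.0  # stands in for a parameter gradient with grad_req='add'
loss_denom = 0
for i, (grad, ntokens) in enumerate(batches):
    grad_buffer += grad / rescale_loss  # loss is divided by rescale_loss before backward()
    loss_denom += ntokens
    is_last = (i == len(batches) - 1)
    if (i + 1) % num_accumulated == 0 or is_last:
        # trainer.step(x) divides gradients by x, so the effective per-token gradient is:
        effective_grad = grad_buffer / (loss_denom / rescale_loss)
        print('step: per-token gradient %.6f over %d tokens' % (effective_grad, loss_denom))
        grad_buffer = 0.0  # model.zero_grad()
        loss_denom = 0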
if __name__ == '__main__':
- train()
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ np.random.seed(args.seed)
+ mx.random.seed(args.seed)
+ random.seed(args.seed)
+ train(args)
diff --git a/scripts/machine_translation/translation.py b/scripts/machine_translation/translation.py
deleted file mode 100644
index 34127b6f4c..0000000000
--- a/scripts/machine_translation/translation.py
+++ /dev/null
@@ -1,80 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Machine translation models and translators."""
-
-
-__all__ = ['BeamSearchTranslator']
-
-import numpy as np
-import mxnet as mx
-from gluonnlp.model import BeamSearchScorer, BeamSearchSampler
-
-class BeamSearchTranslator:
- """Beam Search Translator
-
- Parameters
- ----------
- model : NMTModel
- The neural machine translation model
- beam_size : int
- Size of the beam
- scorer : BeamSearchScorer
- Score function used in beamsearch
- max_length : int
- The maximum decoding length
- """
- def __init__(self, model, beam_size=1, scorer=BeamSearchScorer(), max_length=100):
- self._model = model
- self._sampler = BeamSearchSampler(
- decoder=self._decode_logprob,
- beam_size=beam_size,
- eos_id=model.tgt_vocab.token_to_idx[model.tgt_vocab.eos_token],
- scorer=scorer,
- max_length=max_length)
-
- def _decode_logprob(self, step_input, states):
- out, states, _ = self._model.decode_step(step_input, states)
- return mx.nd.log_softmax(out), states
-
- def translate(self, src_seq, src_valid_length):
- """Get the translation result given the input sentence.
-
- Parameters
- ----------
- src_seq : mx.nd.NDArray
- Shape (batch_size, length)
- src_valid_length : mx.nd.NDArray
- Shape (batch_size,)
-
- Returns
- -------
- samples : NDArray
- Samples draw by beam search. Shape (batch_size, beam_size, length). dtype is int32.
- scores : NDArray
- Scores of the samples. Shape (batch_size, beam_size). We make sure that scores[i, :] are
- in descending order.
- valid_length : NDArray
- The valid length of the samples. Shape (batch_size, beam_size). dtype will be int32.
- """
- batch_size = src_seq.shape[0]
- encoder_outputs, _ = self._model.encode(src_seq, valid_length=src_valid_length)
- decoder_states = self._model.decoder.init_state_from_encoder(encoder_outputs,
- src_valid_length)
- inputs = mx.nd.full(shape=(batch_size,), ctx=src_seq.context, dtype=np.float32,
- val=self._model.tgt_vocab.token_to_idx[self._model.tgt_vocab.bos_token])
- samples, scores, sample_valid_length = self._sampler(inputs, decoder_states)
- return samples, scores, sample_valid_length
diff --git a/scripts/machine_translation/utils.py b/scripts/machine_translation/utils.py
deleted file mode 100644
index 1494fa43ab..0000000000
--- a/scripts/machine_translation/utils.py
+++ /dev/null
@@ -1,69 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility functions."""
-
-import os
-import logging
-import inspect
-
-__all__ = ['logging_config']
-
-
-def logging_config(folder=None, name=None,
- level=logging.DEBUG,
- console_level=logging.INFO,
- no_console=False):
- """ Config the logging.
-
- Parameters
- ----------
- folder : str or None
- name : str or None
- level : int
- console_level
- no_console: bool
- Whether to disable the console log
- Returns
- -------
- folder : str
- Folder that the logging file will be saved into.
- """
- if name is None:
- name = inspect.stack()[1][1].split('.')[0]
- if folder is None:
- folder = os.path.join(os.getcwd(), name)
- if not os.path.exists(folder):
- os.makedirs(folder)
- # Remove all the current handlers
- for handler in logging.root.handlers:
- logging.root.removeHandler(handler)
- logging.root.handlers = []
- logpath = os.path.join(folder, name + '.log')
- print('All Logs will be saved to {}'.format(logpath))
- logging.root.setLevel(level)
- formatter = logging.Formatter('%(asctime)s - %(name)s - %(message)s')
- logfile = logging.FileHandler(logpath)
- logfile.setLevel(level)
- logfile.setFormatter(formatter)
- logging.root.addHandler(logfile)
- if not no_console:
- # Initialze the console logging
- logconsole = logging.StreamHandler()
- logconsole.setLevel(console_level)
- logconsole.setFormatter(formatter)
- logging.root.addHandler(logconsole)
- return folder
diff --git a/scripts/machine_translation/wmt2014_back_translation.sh b/scripts/machine_translation/wmt2014_back_translation.sh
new file mode 100644
index 0000000000..ebe344a773
--- /dev/null
+++ b/scripts/machine_translation/wmt2014_back_translation.sh
@@ -0,0 +1,160 @@
+SUBWORD_ALGO=$1
+SRC=en
+TGT=de
+
+# Prepare the WMT2014 en-de parallel data for the reverse (de->en) model
+cd ../datasets/machine_translation
+bash wmt2014_ende.sh ${SUBWORD_ALGO}
+
+# Fetch the raw mono text
+nlp_data prepare_wmt \
+ --mono \
+ --mono_lang ${TGT} \
+ --dataset newscrawl \
+ --save-path wmt2014_mono
+
+
+# Clean and tokenize the monolingual corpus
+cd wmt2014_mono
+nlp_preprocess clean_tok_mono_corpus \
+ --lang ${TGT} \
+ --corpus train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 100 \
+ --save-path train.tok.${TGT}
+
+cd ../../../machine_translation
+datapath=../datasets/machine_translation
+
+# Train the reverse model to translate German to English
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${TGT} \
+ --train_tgt_corpus ${datapath}/wmt2014_ende/train.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir transformer_wmt2014_de_en_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.002 \
+ --warmup_steps 4000 \
+ --warmup_init_lr 0.0 \
+ --seed 100 \
+ --gpus 0,1,2,3
+
+# Due to limited memory, we split the monolingual data and process each split separately
+split -l 400000 ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/train.tok.${TGT}.split -d -a 3
+
+# Translate the monolingual data with the reverse model to generate the synthetic source corpus
+# Note that some batches may be too large to fit in GPU memory
+GPUS=(0 1 2 3)
+IDX=0
+for NUM in ` seq -f %03g 0 193 `; do
+ split_corpus=${datapath}/wmt2014_mono/train.tok.${TGT}.split${NUM}
+ if [ ${IDX} -eq ${#GPUS[@]} ]; then
+ let "IDX=0"
+ wait
+ fi
+ {
+ echo processing ${split_corpus}
+ python3 evaluate_transformer.py \
+ --param_path transformer_wmt2014_de_en_${SUBWORD_ALGO}/average.params \
+ --src_lang ${TGT} \
+ --tgt_lang ${SRC} \
+ --cfg transformer_base \
+ --src_tokenizer ${SUBWORD_ALGO} \
+ --tgt_tokenizer ${SUBWORD_ALGO} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --src_corpus ${split_corpus} \
+ --save_dir ${split_corpus/.${TGT}./.${SRC}.} \
+ --beam-size 1 \
+ --inference \
+ --gpus ${GPUS[IDX]}
+ } &
+ let "IDX++"
+done
+wait
+
+cat ` seq -f "${datapath}/wmt2014_mono/train.tok.${SRC}.split%03g/pred_sentences.txt" 0 193 ` \
+ > ${datapath}/wmt2014_mono/syn.train.raw.${SRC}
+cp ${datapath}/wmt2014_mono/train.tok.${TGT} ${datapath}/wmt2014_mono/syn.train.raw.${TGT}
+
+# Clean the synthetic data
+nlp_preprocess clean_tok_para_corpus --src-lang ${SRC} \
+ --tgt-lang ${TGT} \
+ --src-corpus ${datapath}/wmt2014_mono/syn.train.raw.${SRC} \
+ --tgt-corpus ${datapath}/wmt2014_mono/syn.train.raw.${TGT} \
+ --min-num-words 1 \
+ --max-num-words 250 \
+ --max-ratio 1.5 \
+ --src-save-path ${datapath}/wmt2014_mono/syn.train.tok.${SRC} \
+ --tgt-save-path ${datapath}/wmt2014_mono/syn.train.tok.${TGT}
+
+# Combine the synthetic data with upsampled original data
+# TODO upsample
+rm -rf ${datapath}/wmt2014_backtranslation
+mkdir ${datapath}/wmt2014_backtranslation
+for LANG in ${SRC} ${TGT} ; do
+ cat ${datapath}/wmt2014_ende/train.tok.${LANG} ${datapath}/wmt2014_mono/syn.train.tok.${LANG} \
+ > ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG}
+done
+
+# Apply the subword model to the combined corpus
+for LANG in ${SRC} ${TGT} ; do
+ nlp_preprocess apply_subword --model ${SUBWORD_ALGO} \
+ --output-type subword \
+ --model-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --vocab-path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${LANG} \
+ --save-path ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${LANG}
+done
+
+# Use the combined data to train the new model
+python3 train_transformer.py \
+ --train_src_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${SRC} \
+ --train_tgt_corpus ${datapath}/wmt2014_backtranslation/bt.train.tok.${SUBWORD_ALGO}.${TGT} \
+ --dev_src_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${SRC} \
+ --dev_tgt_corpus ${datapath}/wmt2014_ende/dev.tok.${SUBWORD_ALGO}.${TGT} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --save_dir backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO} \
+ --cfg transformer_base \
+ --lr 0.003 \
+ --max_num_tokens 4096 \
+ --sampler BoundedBudgetSampler \
+ --comm_backend horovod \
+ --max_update 30000 \
+ --save_interval_update 1000 \
+ --warmup_steps 6000 \
+ --warmup_init_lr 0.0 \
+ --num_averages -1 \
+ --seed 123 \
+ --gpus 0,1,2,3
+
+# TODO nlp_average_checkpoint
+nlp_nmt average_checkpoint --prefix range() \
+ --suffix \
+ --save-path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/average.params
+
+# Finally, we can evaluate the model
+python3 evaluate_transformer.py \
+ --param_path backtranslation_transformer_wmt2014_ende_${SUBWORD_ALGO}/avg_20_29.params \
+ --src_lang ${SRC} \
+ --tgt_lang ${TGT} \
+ --cfg transformer_base \
+ --src_tokenizer ${SUBWORD_ALGO} \
+ --tgt_tokenizer ${SUBWORD_ALGO} \
+ --src_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --tgt_subword_model_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.model \
+ --src_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --tgt_vocab_path ${datapath}/wmt2014_ende/${SUBWORD_ALGO}.vocab \
+ --src_corpus ${datapath}/wmt2014_ende/test.raw.${SRC} \
+ --tgt_corpus ${datapath}/wmt2014_ende/test.raw.${TGT} \
+ --gpus 0
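The back-translation loop above fans the 194 monolingual splits out over the available GPUs round-robin, waiting for each wave of jobs to finish before launching the next. The same scheduling is expressed below as a short Python sketch; the echo command is only a placeholder for the evaluate_transformer.py invocation in the script.

import subprocess

gpus = [0, 1, 2, 3]
splits = ['train.tok.de.split%03d' % i for i in range(194)]

in_flight = []
for idx, split in enumerate(splits):
    gpu = gpus[idx % len(gpus)]
    # Placeholder command; the real job is the evaluate_transformer.py call above.
    in_flight.append(subprocess.Popen(['echo', 'translating %s on gpu %d' % (split, gpu)]))
    if len(in_flight) == len(gpus):  # one job per GPU, then wait for the wave to drain
        for proc in in_flight:
            proc.wait()
        in_flight = []
for proc in in_flight:
    proc.wait()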
diff --git a/scripts/natural_language_inference/dataset.py b/scripts/natural_language_inference/dataset.py
deleted file mode 100644
index 31496a691e..0000000000
--- a/scripts/natural_language_inference/dataset.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=logging-format-interpolation
-
-"""
-Data loading and batching.
-"""
-
-import os
-import logging
-from mxnet import gluon
-import gluonnlp as nlp
-import gluonnlp.data.batchify as btf
-
-logger = logging.getLogger('nli')
-LABEL_TO_IDX = {'neutral': 0, 'contradiction': 1, 'entailment': 2}
-
-def read_dataset(args, dataset):
- """
- Read dataset from tokenized files.
- """
- path = os.path.join(vars(args)[dataset])
- logger.info('reading data from {}'.format(path))
- examples = [line.strip().split('\t') for line in open(path)]
- if args.max_num_examples > 0:
- examples = examples[:args.max_num_examples]
- # NOTE: assume data has been tokenized
- dataset = gluon.data.SimpleDataset([(e[0], e[1], LABEL_TO_IDX[e[2]]) for e in examples])
- dataset = dataset.transform(lambda s1, s2, label: (
- ['NULL'] + s1.lower().split(),
- ['NULL'] + s2.lower().split(), label),
- lazy=False)
- logger.info('read {} examples'.format(len(dataset)))
- return dataset
-
-def build_vocab(dataset):
- """
- Build vocab given a dataset.
- """
- counter = nlp.data.count_tokens([w for e in dataset for s in e[:2] for w in s],
- to_lower=True)
- vocab = nlp.Vocab(counter)
- return vocab
-
-def prepare_data_loader(args, dataset, vocab, test=False):
- """
- Read data and build data loader.
- """
- # Preprocess
- dataset = dataset.transform(lambda s1, s2, label: (vocab(s1), vocab(s2), label),
- lazy=False)
-
- # Batching
- batchify_fn = btf.Tuple(btf.Pad(pad_val=0), btf.Pad(pad_val=0), btf.Stack(dtype='int32'))
- data_lengths = [max(len(d[0]), len(d[1])) for d in dataset]
- batch_sampler = nlp.data.FixedBucketSampler(lengths=data_lengths,
- batch_size=args.batch_size,
- shuffle=(not test))
- data_loader = gluon.data.DataLoader(dataset=dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
- return data_loader
diff --git a/scripts/natural_language_inference/decomposable_attention.py b/scripts/natural_language_inference/decomposable_attention.py
deleted file mode 100644
index f991461e03..0000000000
--- a/scripts/natural_language_inference/decomposable_attention.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=arguments-differ
-
-"""
-Implementation of the decomposable attention model with intra sentence attention.
-"""
-
-from mxnet import gluon
-from mxnet.gluon import nn
-
-
-class DecomposableAttentionModel(gluon.HybridBlock):
- """
- A Decomposable Attention Model for Natural Language Inference
- using intra-sentence attention.
- Arxiv paper: https://arxiv.org/pdf/1606.01933.pdf
- """
- def __init__(self, vocab_size, word_embed_size, hidden_size,
- dropout=0., intra_attention=False, **kwargs):
- super(DecomposableAttentionModel, self).__init__(**kwargs)
- self.word_embed_size = word_embed_size
- self.hidden_size = hidden_size
- self.use_intra_attention = intra_attention
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- self.word_emb = nn.Embedding(vocab_size, word_embed_size)
- self.lin_proj = nn.Dense(hidden_size, in_units=word_embed_size,
- flatten=False, use_bias=False)
- if self.use_intra_attention:
- self.intra_attention = IntraSentenceAttention(hidden_size, hidden_size, dropout)
- input_size = hidden_size * 2
- else:
- self.intra_attention = None
- input_size = hidden_size
- self.model = DecomposableAttention(input_size, hidden_size, 3, dropout)
-
- def hybrid_forward(self, F, sentence1, sentence2):
- """
- Predict the relation of two sentences.
-
- Parameters
- ----------
- sentence1 : NDArray
- Shape (batch_size, length)
- sentence2 : NDArray
- Shape (batch_size, length)
-
- Returns
- -------
- pred : NDArray
- Shape (batch_size, num_classes). num_classes == 3.
-
- """
- feature1 = self.lin_proj(self.word_emb(sentence1))
- feature2 = self.lin_proj(self.word_emb(sentence2))
- if self.use_intra_attention:
- feature1 = F.concat(feature1, self.intra_attention(feature1), dim=-1)
- feature2 = F.concat(feature2, self.intra_attention(feature2), dim=-1)
- pred = self.model(feature1, feature2)
- return pred
-
-class IntraSentenceAttention(gluon.HybridBlock):
- """
- Intra Sentence Attention block.
- """
- def __init__(self, inp_size, hidden_size, dropout=0., **kwargs):
- super(IntraSentenceAttention, self).__init__(**kwargs)
- self.hidden_size = hidden_size
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- # F_intra in the paper
- self.intra_attn_emb = nn.HybridSequential()
- self.intra_attn_emb.add(self.dropout_layer)
- self.intra_attn_emb.add(nn.Dense(hidden_size, in_units=inp_size,
- activation='relu', flatten=False))
- self.intra_attn_emb.add(self.dropout_layer)
- self.intra_attn_emb.add(nn.Dense(hidden_size, in_units=hidden_size,
- activation='relu', flatten=False))
-
- def hybrid_forward(self, F, feature_a):
- """
- Compute intra-sentence attention given embedded words.
-
- Parameters
- ----------
- feature_a : NDArray
- Shape (batch_size, length, hidden_size)
-
- Returns
- -------
- alpha : NDArray
- Shape (batch_size, length, hidden_size)
- """
- tilde_a = self.intra_attn_emb(feature_a)
- e_matrix = F.batch_dot(tilde_a, tilde_a, transpose_b=True)
- alpha = F.batch_dot(e_matrix.softmax(), tilde_a)
- return alpha
-
-class DecomposableAttention(gluon.HybridBlock):
- """
- Decomposable Attention block.
- """
- def __init__(self, inp_size, hidden_size, num_class, dropout=0., **kwargs):
- super(DecomposableAttention, self).__init__(**kwargs)
- with self.name_scope():
- self.dropout_layer = nn.Dropout(dropout)
- # attention function
- self.f = self._ff_layer(in_units=inp_size, out_units=hidden_size, flatten=False)
- # compare function
- self.g = self._ff_layer(in_units=hidden_size * 2, out_units=hidden_size, flatten=False)
- # predictor
- self.h = self._ff_layer(in_units=hidden_size * 2, out_units=hidden_size, flatten=True)
- self.h.add(nn.Dense(num_class, in_units=hidden_size))
- # extract features
- self.hidden_size = hidden_size
- self.inp_size = inp_size
-
- def _ff_layer(self, in_units, out_units, flatten=True):
- m = nn.HybridSequential()
- m.add(self.dropout_layer)
- m.add(nn.Dense(out_units, in_units=in_units, activation='relu', flatten=flatten))
- m.add(self.dropout_layer)
- m.add(nn.Dense(out_units, in_units=out_units, activation='relu', flatten=flatten))
- return m
-
- def hybrid_forward(self, F, a, b):
- """
- Forward of Decomposable Attention layer
- """
- # a.shape = [B, L1, H]
- # b.shape = [B, L2, H]
- # extract features
- tilde_a = self.f(a) # shape = [B, L1, H]
- tilde_b = self.f(b) # shape = [B, L2, H]
- # attention
- # e.shape = [B, L1, L2]
- e = F.batch_dot(tilde_a, tilde_b, transpose_b=True)
- # beta: b align to a, [B, L1, H]
- beta = F.batch_dot(e.softmax(), tilde_b)
- # alpha: a align to b, [B, L2, H]
- alpha = F.batch_dot(e.transpose([0, 2, 1]).softmax(), tilde_a)
- # compare
- feature1 = self.g(F.concat(tilde_a, beta, dim=2))
- feature2 = self.g(F.concat(tilde_b, alpha, dim=2))
- feature1 = feature1.sum(axis=1)
- feature2 = feature2.sum(axis=1)
- yhat = self.h(F.concat(feature1, feature2, dim=1))
- return yhat
diff --git a/scripts/natural_language_inference/esim.py b/scripts/natural_language_inference/esim.py
deleted file mode 100644
index e6d17b8698..0000000000
--- a/scripts/natural_language_inference/esim.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""
-Build an Enhancing LSTM model for Natural Language Inference
-"""
-
-__all__ = ['ESIMModel']
-
-from mxnet.gluon import nn, rnn
-
-EPS = 1e-12
-
-
-class ESIMModel(nn.HybridBlock):
- """"Enhanced LSTM for Natural Language Inference" Qian Chen,
- Xiaodan Zhu, Zhenhua Ling, Si Wei, Hui Jiang, Diana Inkpen. ACL (2017)
-
- Parameters
- ----------
- vocab_size: int
- Number of words in vocab
- word_embed_size : int
- Dimension of word vector
- hidden_size : int
- Number of hidden units in lstm cell
- dense_size : int
- Number of hidden units in dense layer
- num_classes : int
- Number of categories
- dropout : int
- Dropout prob
- """
-
- def __init__(self, vocab_size, num_classes, word_embed_size, hidden_size, dense_size,
- dropout=0., **kwargs):
- super(ESIMModel, self).__init__(**kwargs)
- with self.name_scope():
- self.word_emb= nn.Embedding(vocab_size, word_embed_size)
- self.embedding_dropout = nn.Dropout(dropout, axes=1)
- self.lstm_encoder1 = rnn.LSTM(hidden_size, input_size=word_embed_size, bidirectional=True, layout='NTC')
- self.ff_proj = nn.Dense(hidden_size, in_units=hidden_size * 2 * 4, flatten=False, activation='relu')
- self.lstm_encoder2 = rnn.LSTM(hidden_size, input_size=hidden_size, bidirectional=True, layout='NTC')
-
- self.classifier = nn.HybridSequential()
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=hidden_size, activation='relu'))
- if dropout:
- self.classifier.add(nn.Dropout(rate=dropout))
- self.classifier.add(nn.Dense(units=num_classes))
-
- def _soft_attention_align(self, F, x1, x2):
- # attention shape: (batch, x1_seq_len, x2_seq_len)
- attention = F.batch_dot(x1, x2, transpose_b=True)
-
- x1_align = F.batch_dot(attention.softmax(), x2)
- x2_align = F.batch_dot(attention.transpose([0, 2, 1]).softmax(), x1)
-
- return x1_align, x2_align
-
- def _submul(self, F, x1, x2):
- mul = x1 * x2
- sub = x1 - x2
-
- return F.concat(mul, sub, dim=-1)
-
- def _pool(self, F, x):
- p1 = x.mean(axis=1)
- p2 = x.max(axis=1)
-
- return F.concat(p1, p2, dim=-1)
-
- def hybrid_forward(self, F, x1, x2):
- # x1_embed x2_embed shape: (batch, seq_len, word_embed_size)
- x1_embed = self.embedding_dropout(self.word_emb(x1))
- x2_embed = self.embedding_dropout(self.word_emb(x2))
-
- x1_lstm_encode = self.lstm_encoder1(x1_embed)
- x2_lstm_encode = self.lstm_encoder1(x2_embed)
-
- # attention
- x1_algin, x2_algin = self._soft_attention_align(F, x1_lstm_encode, x2_lstm_encode)
-
- # compose
- x1_combined = F.concat(x1_lstm_encode, x1_algin,
- self._submul(F, x1_lstm_encode, x1_algin), dim=-1)
- x2_combined = F.concat(x2_lstm_encode, x2_algin,
- self._submul(F, x2_lstm_encode, x2_algin), dim=-1)
-
- x1_compose = self.lstm_encoder2(self.ff_proj(x1_combined))
- x2_compose = self.lstm_encoder2(self.ff_proj(x2_combined))
-
- # aggregate
- x1_agg = self._pool(F, x1_compose)
- x2_agg = self._pool(F, x2_compose)
-
- # fully connection
- output = self.classifier(F.concat(x1_agg, x2_agg, dim=-1))
-
- return output
diff --git a/scripts/natural_language_inference/index.rst b/scripts/natural_language_inference/index.rst
deleted file mode 100644
index 8abf55fd45..0000000000
--- a/scripts/natural_language_inference/index.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-Natural Language Inference
---------------------------
-
-:download:`Download scripts `
-
-Replication of the model described in `A Decomposable Attention Model for Natural Language Inference `_.
-
-Download the SNLI dataset:
-
-.. code-block:: console
-
- $ mkdir data
- $ curl https://nlp.stanford.edu/projects/snli/snli_1.0.zip -o data/snli_1.0.zip
- $ unzip data/snli_1.0.zip -d data
-
-Preprocess the data:
-
-.. code-block:: console
-
- $ for split in train dev test; do python preprocess.py --input data/snli_1.0/snli_1.0_$split.txt --output data/snli_1.0/$split.txt; done
-
-Train the model without intra-sentence attention:
-
-.. code-block:: console
-
- $ python main.py --train-file data/snli_1.0/train.txt --test-file data/snli_1.0/dev.txt --output-dir output/snli-basic --batch-size 32 --print-interval 5000 --lr 0.025 --epochs 300 --gpu-id 0 --dropout 0.2 --weight-decay 1e-5 --fix-embedding
-
-Test:
-
-.. code-block:: console
-
- $ python main.py --test-file data/snli_1.0/test.txt --model-dir output/snli-basic --gpu-id 0 --mode test --output-dir output/snli-basic/test
-
-We achieve 85.0% accuracy on the SNLI test set, comparable to 86.3% reported in the
-original paper. `[Training log] `__
-
-Train the model with intra-sentence attention:
-
-.. code-block:: console
-
- $ python main.py --train-file data/snli_1.0/train.txt --test-file data/snli_1.0/dev.txt --output-dir output/snli-intra --batch-size 32 --print-interval 5000 --lr 0.025 --epochs 300 --gpu-id 0 --dropout 0.2 --weight-decay 1e-5 --intra-attention --fix-embedding
-
-Test:
-
-.. code-block:: console
-
- $ python main.py --test-file data/snli_1.0/test.txt --model-dir output/snli-intra --gpu-id 0 --mode test --output-dir output/snli-intra/test
-
-We achieve 85.5% accuracy on the SNLI test set, compared to 86.8% reported in the
-original paper. `[Training log] `__
-Note that our intra-sentence attention implementation omitted the
-distance-sensitive bias term described in Equation (7) in the original paper.
-
diff --git a/scripts/natural_language_inference/main.py b/scripts/natural_language_inference/main.py
deleted file mode 100644
index 5dc79f2b5c..0000000000
--- a/scripts/natural_language_inference/main.py
+++ /dev/null
@@ -1,254 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-# pylint: disable=redefined-outer-name,logging-format-interpolation
-
-"""
-Decomposable Attention Models for Natural Language Inference
-============================================================
-
-This script reproduces results in [Parikh et al., 2016] with the Gluon NLP Toolkit.
-
-@article{parikh2016decomposable,
- title={A decomposable attention model for natural language inference},
- author={Parikh, Ankur P and T{\"a}ckstr{\"o}m, Oscar and Das, Dipanjan and Uszkoreit, Jakob},
- journal={arXiv preprint arXiv:1606.01933},
- year={2016}
-}
-"""
-
-import os
-import argparse
-import json
-import logging
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-import gluonnlp as nlp
-
-from decomposable_attention import DecomposableAttentionModel
-from esim import ESIMModel
-from dataset import read_dataset, prepare_data_loader, build_vocab
-from utils import logging_config
-
-logger = logging.getLogger('nli')
-
-nlp.utils.check_version('0.7.0')
-
-def parse_args():
- """
- Parse arguments.
- """
- parser = argparse.ArgumentParser()
- parser.add_argument('--gpu-id', type=int, default=0,
- help='GPU id (-1 means CPU)')
- parser.add_argument('--train-file', default='snli_1.0/snli_1.0_train.txt',
- help='training set file')
- parser.add_argument('--test-file', default='snli_1.0/snli_1.0_dev.txt',
- help='validation set file')
- parser.add_argument('--max-num-examples', type=int, default=-1,
- help='maximum number of examples to load (for debugging)')
- parser.add_argument('--batch-size', type=int, default=32,
- help='batch size')
- parser.add_argument('--print-interval', type=int, default=20,
- help='number of batches between two consecutive log prints')
- parser.add_argument('--model', choices=['da', 'esim'], default=None, required=True,
- help='which model to use')
- parser.add_argument('--mode', choices=['train', 'test'], default='train',
- help='train or test')
- parser.add_argument('--lr', type=float, default=0.025,
- help='learning rate')
- parser.add_argument('--epochs', type=int, default=300,
- help='maximum number of epochs to train')
- parser.add_argument('--embedding', default='glove',
- help='word embedding type')
- parser.add_argument('--fix-embedding', action='store_true',
- help='whether to fix pretrained word embedding')
- parser.add_argument('--embedding-source', default='glove.840B.300d',
- help='embedding file source')
- parser.add_argument('--embedding-size', type=int, default=300,
- help='size of pretrained word embedding')
- parser.add_argument('--hidden-size', type=int, default=200,
- help='hidden layer size')
- parser.add_argument('--output-dir', default='./output',
- help='directory for all experiment output')
- parser.add_argument('--model-dir', default='./output',
- help='directory to load model')
- parser.add_argument('--seed', type=int, default=0,
- help='random seed')
- parser.add_argument('--dropout', type=float, default=0.,
- help='dropout rate')
- parser.add_argument('--optimizer', choices=['adam', 'adagrad'], default='adagrad',
- help='optimization method')
- parser.add_argument('--weight-decay', type=float, default=0.,
- help='l2 regularization weight')
- parser.add_argument('--intra-attention', action='store_true',
- help='use intra-sentence attention')
-
- return parser.parse_args()
-
-def train_model(model, train_data_loader, val_data_loader, embedding, ctx, args):
- """
- Train model and validate/save every epoch.
- """
- logger.info(vars(args))
-
- # Initialization
- model.hybridize()
- model.collect_params().initialize(mx.init.Normal(0.01), ctx=ctx)
- model.word_emb.weight.set_data(embedding.idx_to_vec)
- # Fix word embedding
- if args.fix_embedding:
- model.word_emb.weight.grad_req = 'null'
-
- loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
- trainer = gluon.Trainer(model.collect_params(), args.optimizer,
- {'learning_rate': args.lr,
- 'wd': args.weight_decay,
- 'clip_gradient': 5})
-
- checkpoints_dir = os.path.join(args.output_dir, 'checkpoints')
- if not os.path.exists(checkpoints_dir):
- os.makedirs(checkpoints_dir)
-
- best_val_acc = 0.
- for epoch_id in range(args.epochs):
- avg_loss = 0.
- avg_acc = 0.
- for batch_id, example in enumerate(train_data_loader):
- s1, s2, label = example
- s1 = s1.as_in_context(ctx)
- s2 = s2.as_in_context(ctx)
- label = label.as_in_context(ctx)
-
- with autograd.record():
- output = model(s1, s2)
- loss = loss_func(output, label).mean()
- loss.backward()
- trainer.step(1)
- avg_loss += loss.sum().asscalar()
-
- pred = output.argmax(axis=1)
- acc = (pred == label.astype(np.float32)).mean()
- avg_acc += acc.asscalar()
-
- if (batch_id + 1) % args.print_interval == 0:
- avg_loss /= args.print_interval
- avg_acc /= args.print_interval
- logger.info('[Epoch {} Batch {}/{}] loss={:.4f}, acc={:.4f}'
- .format(epoch_id, batch_id + 1, len(train_data_loader),
- avg_loss, avg_acc))
- avg_loss = 0.
- avg_acc = 0.
-
- # Validation
- val_loss, val_acc = test_model(model, val_data_loader, loss_func, ctx)
- if val_acc > best_val_acc:
- best_val_acc = val_acc
- checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'valid_best.params')
- model.save_parameters(checkpoint_path)
- logger.info('[Epoch {}] valid loss={:.4f}, valid acc={:.4f}, best valid acc={:.4f}'
- .format(epoch_id, val_loss, val_acc, best_val_acc))
-
- # Save checkpoint of last epoch
- checkpoint_path = os.path.join(args.output_dir, 'checkpoints', 'last.params')
- model.save_parameters(checkpoint_path)
-
-def test_model(model, data_loader, loss_func, ctx):
- """
- Test model.
- """
- acc = 0.
- loss = 0.
- for _, example in enumerate(data_loader):
- s1, s2, label = example
- s1 = s1.as_in_context(ctx)
- s2 = s2.as_in_context(ctx)
- label = label.as_in_context(ctx)
- output = model(s1, s2)
- loss += loss_func(output, label).mean().asscalar()
- pred = output.argmax(axis=1)
- acc += (pred == label.astype(np.float32)).mean().asscalar()
- acc /= len(data_loader)
- loss /= len(data_loader)
- return loss, acc
-
-def build_model(args, vocab):
- if args.model == 'da':
- model = DecomposableAttentionModel(len(vocab), args.embedding_size, args.hidden_size,
- args.dropout, args.intra_attention)
- elif args.model == 'esim':
- model = ESIMModel(len(vocab), 3, args.embedding_size, args.hidden_size,
- args.dropout)
- return model
-
-def main(args):
- """
- Entry point: train or test.
- """
- json.dump(vars(args), open(os.path.join(args.output_dir, 'config.json'), 'w'))
-
- if args.gpu_id == -1:
- ctx = mx.cpu()
- else:
- ctx = mx.gpu(args.gpu_id)
-
- mx.random.seed(args.seed, ctx=ctx)
-
- if args.mode == 'train':
- train_dataset = read_dataset(args, 'train_file')
- val_dataset = read_dataset(args, 'test_file')
-
- vocab_path = os.path.join(args.output_dir, 'vocab.jsons')
- if os.path.exists(vocab_path):
- vocab = nlp.Vocab.from_json(open(vocab_path).read())
- else:
- vocab = build_vocab(train_dataset)
- with open(vocab_path, 'w') as fout:
- fout.write(vocab.to_json())
- glove = nlp.embedding.create(args.embedding, source=args.embedding_source)
- vocab.set_embedding(glove)
-
- train_data_loader = prepare_data_loader(args, train_dataset, vocab)
- val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
-
- model = build_model(args, vocab)
- train_model(model, train_data_loader, val_data_loader, vocab.embedding, ctx, args)
- elif args.mode == 'test':
- model_args = argparse.Namespace(**json.load(
- open(os.path.join(args.model_dir, 'config.json'))))
- vocab = nlp.Vocab.from_json(
- open(os.path.join(args.model_dir, 'vocab.jsons')).read())
- val_dataset = read_dataset(args, 'test_file')
- val_data_loader = prepare_data_loader(args, val_dataset, vocab, test=True)
- model = build_model(model_args, vocab)
- model.load_parameters(os.path.join(
- args.model_dir, 'checkpoints', 'valid_best.params'), ctx=ctx)
- loss_func = gluon.loss.SoftmaxCrossEntropyLoss()
- logger.info('Test on {}'.format(args.test_file))
- loss, acc = test_model(model, val_data_loader, loss_func, ctx)
- logger.info('loss={:.4f} acc={:.4f}'.format(loss, acc))
-
-if __name__ == '__main__':
- args = parse_args()
- if not os.path.exists(args.output_dir):
- os.makedirs(args.output_dir)
-
- logging_config(os.path.join(args.output_dir, 'main.log'))
-
- main(args)
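One detail of the removed main.py worth noting: train mode persists the vocabulary as JSON next to the checkpoints, and test mode reads it back so the token-to-index mapping is identical across runs. A minimal sketch of that round trip (toy corpus, not the script's data path):

```python
import gluonnlp as nlp

counter = nlp.data.count_tokens('a man is walking a dog in the park'.split())
vocab = nlp.Vocab(counter)

json_str = vocab.to_json()                 # what train mode writes out
restored = nlp.Vocab.from_json(json_str)   # what test mode reads back
assert restored['dog'] == vocab['dog']
```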
diff --git a/scripts/natural_language_inference/preprocess.py b/scripts/natural_language_inference/preprocess.py
deleted file mode 100644
index d1031bee11..0000000000
--- a/scripts/natural_language_inference/preprocess.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=redefined-outer-name
-
-"""
-Tokenize the SNLI dataset.
-"""
-
-import argparse
-import csv
-import nltk
-
-def parse_args():
- parser = argparse.ArgumentParser()
- parser.add_argument('--input',
- help='.txt file for the SNLI dataset')
- parser.add_argument('--output',
- help='path for tokenized output file')
- args = parser.parse_args()
- return args
-
-def read_tokens(tree_str):
- t = nltk.Tree.fromstring(tree_str)
- return t.leaves()
-
-def main(args):
- """
- Read tokens from the provided parse tree in the SNLI dataset.
- Examples without a valid gold label are removed.
- """
- examples = []
- with open(args.input, 'r') as fin:
- reader = csv.DictReader(fin, delimiter='\t')
- for cols in reader:
- s1 = read_tokens(cols['sentence1_parse'])
- s2 = read_tokens(cols['sentence2_parse'])
- label = cols['gold_label']
- if label in ('neutral', 'contradiction', 'entailment'):
- examples.append((s1, s2, label))
- with open(args.output, 'w') as fout:
- for s1, s2, l in examples:
- fout.write('{}\t{}\t{}\n'.format(' '.join(s1), ' '.join(s2), l))
-
-
-if __name__ == '__main__':
- args = parse_args()
- main(args)
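The removed preprocessing relies on the constituency parses shipped with SNLI rather than re-tokenizing raw text: `read_tokens` simply takes the leaves of the parse tree. A tiny illustration with a made-up parse string:

```python
import nltk

# a labelled constituency parse in the style of the sentence1_parse column
tree_str = '(ROOT (S (NP (DT A) (NN dog)) (VP (VBZ runs))))'
tokens = nltk.Tree.fromstring(tree_str).leaves()
print(tokens)  # ['A', 'dog', 'runs']
```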
diff --git a/scripts/natural_language_inference/utils.py b/scripts/natural_language_inference/utils.py
deleted file mode 100644
index 9e1b848491..0000000000
--- a/scripts/natural_language_inference/utils.py
+++ /dev/null
@@ -1,54 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# Copyright 2018 Mengxiao Lin .
-
-"""
-Utility functions.
-"""
-
-import logging
-
-def logging_config(logpath=None,
- level=logging.DEBUG,
- console_level=logging.INFO,
- no_console=False):
- """
- Configure logging.
- """
- logger = logging.getLogger('nli')
- # Remove all the current handlers
- for handler in logger.handlers:
- logger.removeHandler(handler)
- logger.handlers = []
- logger.propagate = False
- logger.setLevel(logging.DEBUG)
-
- formatter = logging.Formatter('%(filename)s:%(funcName)s: %(message)s')
-
- if logpath is not None:
- print('All Logs will be saved to {}'.format(logpath))
- logfile = logging.FileHandler(logpath, mode='w')
- logfile.setLevel(level)
- logfile.setFormatter(formatter)
- logger.addHandler(logfile)
-
- if not no_console:
- # Initialize console logging
- logconsole = logging.StreamHandler()
- logconsole.setLevel(console_level)
- logconsole.setFormatter(formatter)
- logger.addHandler(logconsole)
diff --git a/scripts/ner/data.py b/scripts/ner/data.py
deleted file mode 100644
index f160f607da..0000000000
--- a/scripts/ner/data.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Data utilities for the named entity recognition task."""
-
-import logging
-from collections import namedtuple
-
-import numpy as np
-import mxnet as mx
-import gluonnlp as nlp
-
-TaggedToken = namedtuple('TaggedToken', ['text', 'tag'])
-PredictedToken = namedtuple('PredictedToken', ['text', 'true_tag', 'pred_tag'])
-
-NULL_TAG = 'X'
-
-def bio_bioes(tokens):
- """Convert a list of TaggedTokens in BIO(2) scheme to BIOES scheme.
-
- Parameters
- ----------
- tokens: List[TaggedToken]
- A list of tokens in BIO(2) scheme
-
- Returns
- -------
- List[TaggedToken]:
- A list of tokens in BIOES scheme
- """
- ret = []
- for index, token in enumerate(tokens):
- if token.tag == 'O':
- ret.append(token)
- elif token.tag.startswith('B'):
- # if a B-tag is continued by other tokens with the same entity,
- # then it is still a B-tag
- if index + 1 < len(tokens) and tokens[index + 1].tag.startswith('I'):
- ret.append(token)
- else:
- ret.append(TaggedToken(text=token.text, tag='S' + token.tag[1:]))
- elif token.tag.startswith('I'):
- # if an I-tag is continued by other tokens with the same entity,
- # then it is still an I-tag
- if index + 1 < len(tokens) and tokens[index + 1].tag.startswith('I'):
- ret.append(token)
- else:
- ret.append(TaggedToken(text=token.text, tag='E' + token.tag[1:]))
- return ret
-
-
-def read_bio_as_bio2(data_path):
- """Read CoNLL-formatted text file in BIO scheme in given path as sentences in BIO2 scheme.
-
- Parameters
- ----------
- data_path: str
- Path of the data file to read
-
- Returns
- -------
- List[List[TaggedToken]]:
- List of sentences, each of which is a List of TaggedTokens
- """
-
- with open(data_path, 'r') as ifp:
- sentence_list = []
- current_sentence = []
- prev_tag = 'O'
-
- for line in ifp:
- if len(line.strip()) > 0:
- word, _, _, tag = line.rstrip().split(' ')
- # convert BIO tag to BIO2 tag
- if tag == 'O':
- bio2_tag = 'O'
- else:
- if prev_tag == 'O' or tag[2:] != prev_tag[2:]:
- bio2_tag = 'B' + tag[1:]
- else:
- bio2_tag = tag
- current_sentence.append(TaggedToken(text=word, tag=bio2_tag))
- prev_tag = tag
- else:
- # the sentence was completed if an empty line occurred; flush the current sentence.
- sentence_list.append(current_sentence)
- current_sentence = []
- prev_tag = 'O'
-
- # check if there is a remaining token. in most CoNLL data files, this does not happen.
- if len(current_sentence) > 0:
- sentence_list.append(current_sentence)
- return sentence_list
-
-
-def remove_docstart_sentence(sentences):
- """Remove -DOCSTART- sentences in the list of sentences.
-
- Parameters
- ----------
- sentences: List[List[TaggedToken]]
- List of sentences, each of which is a List of TaggedTokens.
- This list may contain DOCSTART sentences.
-
- Returns
- -------
- List of sentences, each of which is a List of TaggedTokens.
- This list does not contain DOCSTART sentences.
- """
- ret = []
- for sentence in sentences:
- current_sentence = []
- for token in sentence:
- if token.text != '-DOCSTART-':
- current_sentence.append(token)
- if len(current_sentence) > 0:
- ret.append(current_sentence)
- return ret
-
-
-def bert_tokenize_sentence(sentence, bert_tokenizer):
- """Apply BERT tokenizer on a tagged sentence to break words into sub-words.
- This function assumes the input tags follow IOBES and outputs IOBES tags.
-
- Parameters
- ----------
- sentence: List[TaggedToken]
- List of tagged words
- bert_tokenizer: nlp.data.BertTokenizer
- BERT tokenizer
-
- Returns
- -------
- List[TaggedToken]: list of annotated sub-word tokens
- """
- ret = []
- for token in sentence:
- # break a word into sub-word tokens
- sub_token_texts = bert_tokenizer(token.text)
- # only the first token of a word is going to be tagged
- ret.append(TaggedToken(text=sub_token_texts[0], tag=token.tag))
- ret += [TaggedToken(text=sub_token_text, tag=NULL_TAG)
- for sub_token_text in sub_token_texts[1:]]
-
- return ret
-
-
-def load_segment(file_path, bert_tokenizer):
- """Load CoNLL format NER datafile with BIO-scheme tags.
-
- Tagging scheme is converted into BIOES, and words are tokenized into wordpieces
- using `bert_tokenizer`.
-
- Parameters
- ----------
- file_path: str
- Path of the file
- bert_tokenizer: nlp.data.BERTTokenizer
-
- Returns
- -------
- List[List[TaggedToken]]: List of sentences, each of which is the list of `TaggedToken`s.
- """
- logging.info('Loading sentences in %s...', file_path)
- bio2_sentences = remove_docstart_sentence(read_bio_as_bio2(file_path))
- bioes_sentences = [bio_bioes(sentence) for sentence in bio2_sentences]
- subword_sentences = [bert_tokenize_sentence(sentence, bert_tokenizer)
- for sentence in bioes_sentences]
-
- logging.info('loaded %s, max sequence length: %d',
- file_path, max(len(sentence) for sentence in subword_sentences))
-
- return subword_sentences
-
-
-class BERTTaggingDataset:
- """
-
- Parameters
- ----------
- text_vocab: gluon.nlp.Vocab
- Vocabulary of text tokens.
- train_path: Optional[str]
- Path of the file to locate training data.
- dev_path: Optional[str]
- Path of the file to locate development data.
- test_path: Optional[str]
- Path of the file to locate test data.
- seq_len: int
- Length of the input sequence to BERT.
- is_cased: bool
- Whether to use cased model.
- """
-
- def __init__(self, text_vocab, train_path, dev_path, test_path, seq_len, is_cased,
- tag_vocab=None):
- self.text_vocab = text_vocab
- self.seq_len = seq_len
-
- self.bert_tokenizer = nlp.data.BERTTokenizer(vocab=text_vocab, lower=not is_cased)
-
- train_sentences = [] if train_path is None else load_segment(train_path,
- self.bert_tokenizer)
- dev_sentences = [] if dev_path is None else load_segment(dev_path, self.bert_tokenizer)
- test_sentences = [] if test_path is None else load_segment(test_path, self.bert_tokenizer)
- all_sentences = train_sentences + dev_sentences + test_sentences
-
- if tag_vocab is None:
- logging.info('Indexing tags...')
- tag_counter = nlp.data.count_tokens(token.tag
- for sentence in all_sentences for token in sentence)
- self.tag_vocab = nlp.Vocab(tag_counter, padding_token=NULL_TAG,
- bos_token=None, eos_token=None, unknown_token=None)
- else:
- self.tag_vocab = tag_vocab
- self.null_tag_index = self.tag_vocab[NULL_TAG]
-
- if len(test_sentences) > 0:
- logging.info('example test sentences:')
- for i in range(2):
- logging.info(str(test_sentences[i]))
-
- self.train_inputs = [self._encode_as_input(sentence) for sentence in train_sentences]
- self.dev_inputs = [self._encode_as_input(sentence) for sentence in dev_sentences]
- self.test_inputs = [self._encode_as_input(sentence) for sentence in test_sentences]
-
- logging.info('tag_vocab: %s', self.tag_vocab)
-
- def _encode_as_input(self, sentence):
- """Enocde a single sentence into numpy arrays as input to the BERTTagger model.
-
- Parameters
- ----------
- sentence: List[TaggedToken]
- A sentence as a list of tagged tokens.
-
- Returns
- -------
- np.array: token text ids (batch_size, seq_len)
- np.array: token types (batch_size, seq_len),
- which is all zero because we have only one sentence for tagging.
- np.array: valid_length (batch_size,) the number of tokens until [SEP] token
- np.array: tag_ids (batch_size, seq_len)
- np.array: flag_nonnull_tag (batch_size, seq_len),
- which is simply tag_ids != self.null_tag_index
-
- """
- # check whether the given sequence can be fit into `seq_len`.
- assert len(sentence) <= self.seq_len - 2, \
- 'the number of tokens {} should not be larger than {} - 2. offending sentence: {}' \
- .format(len(sentence), self.seq_len, sentence)
-
- text_tokens = ([self.text_vocab.cls_token] + [token.text for token in sentence] +
- [self.text_vocab.sep_token])
- padded_text_ids = (self.text_vocab.to_indices(text_tokens)
- + ([self.text_vocab[self.text_vocab.padding_token]]
- * (self.seq_len - len(text_tokens))))
-
- tags = [NULL_TAG] + [token.tag for token in sentence] + [NULL_TAG]
- padded_tag_ids = (self.tag_vocab.to_indices(tags)
- + [self.tag_vocab[NULL_TAG]] * (self.seq_len - len(tags)))
-
- assert len(text_tokens) == len(tags)
- assert len(padded_text_ids) == len(padded_tag_ids)
- assert len(padded_text_ids) == self.seq_len
-
- valid_length = len(text_tokens)
-
- # in sequence tagging problems, only one sentence is given
- token_types = [0] * self.seq_len
-
- np_tag_ids = np.array(padded_tag_ids, dtype='int32')
- # gluon batchify cannot batchify numpy.bool? :(
- flag_nonnull_tag = (np_tag_ids != self.null_tag_index).astype('int32')
-
- return (np.array(padded_text_ids, dtype='int32'),
- np.array(token_types, dtype='int32'),
- np.array(valid_length, dtype='int32'),
- np_tag_ids,
- flag_nonnull_tag)
-
- @staticmethod
- def _get_data_loader(inputs, shuffle, batch_size):
- return mx.gluon.data.DataLoader(inputs, batch_size=batch_size, shuffle=shuffle,
- last_batch='keep')
-
- def get_train_data_loader(self, batch_size):
- return self._get_data_loader(self.train_inputs, shuffle=True, batch_size=batch_size)
-
- def get_dev_data_loader(self, batch_size):
- return self._get_data_loader(self.dev_inputs, shuffle=False, batch_size=batch_size)
-
- def get_test_data_loader(self, batch_size):
- return self._get_data_loader(self.test_inputs, shuffle=False, batch_size=batch_size)
-
- @property
- def num_tag_types(self):
- """Returns the number of unique tags.
-
- Returns
- -------
- int: number of tag types.
- """
- return len(self.tag_vocab)
-
-
-def convert_arrays_to_text(text_vocab, tag_vocab,
- np_text_ids, np_true_tags, np_pred_tags, np_valid_length):
- """Convert numpy array data into text
-
- Parameters
- ----------
- np_text_ids: token text ids (batch_size, seq_len)
- np_true_tags: tag_ids (batch_size, seq_len)
- np_pred_tags: tag_ids (batch_size, seq_len)
- np.array: valid_length (batch_size,) the number of tokens until [SEP] token
-
- Returns
- -------
- List[List[PredictedToken]]:
-
- """
- predictions = []
- for sample_index in range(np_valid_length.shape[0]):
- sample_len = np_valid_length[sample_index]
- entries = []
- for i in range(1, sample_len - 1):
- token_text = text_vocab.idx_to_token[np_text_ids[sample_index, i]]
- true_tag = tag_vocab.idx_to_token[int(np_true_tags[sample_index, i])]
- pred_tag = tag_vocab.idx_to_token[int(np_pred_tags[sample_index, i])]
- # we don't need to predict on NULL tags
- if true_tag == NULL_TAG:
- last_entry = entries[-1]
- entries[-1] = PredictedToken(text=last_entry.text + token_text,
- true_tag=last_entry.true_tag,
- pred_tag=last_entry.pred_tag)
- else:
- entries.append(PredictedToken(text=token_text,
- true_tag=true_tag, pred_tag=pred_tag))
-
- predictions.append(entries)
- return predictions
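The key preprocessing step in the removed data.py is the BIO2-to-BIOES conversion done by `bio_bioes`: single-token entities become `S-*` and the final token of a multi-token entity becomes `E-*`. A small, self-contained rendering of the same rule (the example sentence is invented here):

```python
from collections import namedtuple

TaggedToken = namedtuple('TaggedToken', ['text', 'tag'])

def bio2_to_bioes(tokens):
    out = []
    for i, tok in enumerate(tokens):
        next_is_inside = i + 1 < len(tokens) and tokens[i + 1].tag.startswith('I')
        if tok.tag == 'O':
            out.append(tok)
        elif tok.tag.startswith('B'):
            out.append(tok if next_is_inside else TaggedToken(tok.text, 'S' + tok.tag[1:]))
        else:  # I-*
            out.append(tok if next_is_inside else TaggedToken(tok.text, 'E' + tok.tag[1:]))
    return out

sent = [TaggedToken('EU', 'B-ORG'), TaggedToken('rejects', 'O'),
        TaggedToken('Peter', 'B-PER'), TaggedToken('Blackburn', 'I-PER')]
print([t.tag for t in bio2_to_bioes(sent)])  # ['S-ORG', 'O', 'B-PER', 'E-PER']
```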
diff --git a/scripts/ner/dataset_sample/test_sample.txt b/scripts/ner/dataset_sample/test_sample.txt
deleted file mode 100644
index 3db1cb9558..0000000000
--- a/scripts/ner/dataset_sample/test_sample.txt
+++ /dev/null
@@ -1,17 +0,0 @@
--DOCSTART- -X- -X- O
-
-SOCCER NN I-NP O
-- : O O
-JAPAN NNP I-NP I-LOC
-GET VB I-VP O
-LUCKY NNP I-NP O
-WIN NNP I-NP O
-, , O O
-CHINA NNP I-NP I-PER
-IN IN I-PP O
-SURPRISE DT I-NP O
-DEFEAT NN I-NP O
-. . O O
-
-Nadim NNP I-NP I-PER
-Ladki NNP I-NP I-PER
diff --git a/scripts/ner/dataset_sample/train_sample.txt b/scripts/ner/dataset_sample/train_sample.txt
deleted file mode 100644
index d4c0f9f7dd..0000000000
--- a/scripts/ner/dataset_sample/train_sample.txt
+++ /dev/null
@@ -1,14 +0,0 @@
--DOCSTART- -X- O O
-
-EU NNP I-NP I-ORG
-rejects VBZ I-VP O
-German JJ I-NP I-MISC
-call NN I-NP O
-to TO I-VP O
-boycott VB I-VP O
-British JJ I-NP I-MISC
-lamb NN I-NP O
-. . O O
-
-Peter NNP I-NP I-PER
-Blackburn NNP I-NP I-PER
diff --git a/scripts/ner/dataset_sample/validation_sample.txt b/scripts/ner/dataset_sample/validation_sample.txt
deleted file mode 100644
index d3219e9079..0000000000
--- a/scripts/ner/dataset_sample/validation_sample.txt
+++ /dev/null
@@ -1,16 +0,0 @@
--DOCSTART- -X- O O
-
-CRICKET NNP I-NP O
-- : O O
-LEICESTERSHIRE NNP I-NP I-ORG
-TAKE NNP I-NP O
-OVER IN I-PP O
-AT NNP I-NP O
-TOP NNP I-NP O
-AFTER NNP I-NP O
-INNINGS NNP I-NP O
-VICTORY NN I-NP O
-. . O O
-
-LONDON NNP I-NP I-LOC
-1996-08-30 CD I-NP O
diff --git a/scripts/ner/finetune_bert.py b/scripts/ner/finetune_bert.py
deleted file mode 100644
index a0943fd05c..0000000000
--- a/scripts/ner/finetune_bert.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Provides command-line interace for training BERT-based named entity recognition model."""
-
-import argparse
-import logging
-import random
-
-import numpy as np
-import mxnet as mx
-
-import gluonnlp as nlp
-
-from ner_utils import get_context, get_bert_model, dump_metadata, str2bool
-from data import BERTTaggingDataset, convert_arrays_to_text
-from model import BERTTagger, attach_prediction
-
-# seqeval is a dependency that is specific to named entity recognition.
-import seqeval.metrics
-
-nlp.utils.check_version('0.7.0')
-
-def parse_args():
- """Parse command line arguments."""
- arg_parser = argparse.ArgumentParser(
- description='Train a BERT-based named entity recognition model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- # data file paths
- arg_parser.add_argument('--train-path', type=str, required=True,
- help='Path to the training data file')
- arg_parser.add_argument('--dev-path', type=str, required=True,
- help='Path to the development data file')
- arg_parser.add_argument('--test-path', type=str, required=True,
- help='Path to the test data file')
-
- arg_parser.add_argument('--save-checkpoint-prefix', type=str, required=False, default=None,
- help='Prefix of model checkpoint file')
-
- # bert options
- arg_parser.add_argument('--bert-model', type=str, default='bert_12_768_12',
- help='Name of the BERT model')
- arg_parser.add_argument('--cased', type=str2bool, default=True,
- help='Whether to use the cased BERT model')
- arg_parser.add_argument('--dropout-prob', type=float, default=0.1,
- help='Dropout probability for the last layer')
-
- # optimization parameters
- arg_parser.add_argument('--seed', type=int, default=13531,
- help='Random number seed.')
- arg_parser.add_argument('--seq-len', type=int, default=180,
- help='The length of the sequence input to BERT.'
- ' An exception will be raised if this is not large enough.')
- arg_parser.add_argument('--gpu', type=int,
- help='Number (index) of GPU to run on, e.g. 0. '
- 'If not specified, uses CPU.')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
- arg_parser.add_argument('--num-epochs', type=int, default=4, help='Number of epochs to train')
- arg_parser.add_argument('--optimizer', type=str, default='bertadam',
- help='Optimization algorithm to use')
- arg_parser.add_argument('--learning-rate', type=float, default=5e-5,
- help='Learning rate for optimization')
- arg_parser.add_argument('--warmup-ratio', type=float, default=0.1,
- help='Warmup ratio for learning rate scheduling')
- args = arg_parser.parse_args()
- return args
-
-
-def main(config):
- """Main method for training BERT-based NER model."""
- # provide random seed for every RNGs we use
- np.random.seed(config.seed)
- random.seed(config.seed)
- mx.random.seed(config.seed)
-
- ctx = get_context(config.gpu)
-
- logging.info('Loading BERT model...')
- bert_model, text_vocab = get_bert_model(config.bert_model, config.cased, ctx,
- config.dropout_prob)
-
- dataset = BERTTaggingDataset(text_vocab, config.train_path, config.dev_path, config.test_path,
- config.seq_len, config.cased)
-
- train_data_loader = dataset.get_train_data_loader(config.batch_size)
- dev_data_loader = dataset.get_dev_data_loader(config.batch_size)
- test_data_loader = dataset.get_test_data_loader(config.batch_size)
-
- net = BERTTagger(bert_model, dataset.num_tag_types, config.dropout_prob)
- net.tag_classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
- net.hybridize(static_alloc=True)
-
- loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss()
- loss_function.hybridize(static_alloc=True)
-
- # step size adaptation, adopted from: https://github.com/dmlc/gluon-nlp/blob/
- # 87d36e3cc7c615f93732d01048cf7ce3b3b09eb7/scripts/bert/finetune_classifier.py#L348-L351
- step_size = config.batch_size
- num_train_steps = int(len(dataset.train_inputs) / step_size * config.num_epochs)
- num_warmup_steps = int(num_train_steps * config.warmup_ratio)
-
- optimizer_params = {'learning_rate': config.learning_rate}
- trainer = mx.gluon.Trainer(net.collect_params(), config.optimizer, optimizer_params)
-
- # collect differentiable parameters
- logging.info('Collect params...')
- # do not apply weight decay on LayerNorm and bias terms
- for _, v in net.collect_params('.*beta|.*gamma|.*bias').items():
- v.wd_mult = 0.0
- params = [p for p in net.collect_params().values() if p.grad_req != 'null']
-
- if config.save_checkpoint_prefix is not None:
- logging.info('dumping metadata...')
- dump_metadata(config, tag_vocab=dataset.tag_vocab)
-
- def train(data_loader, start_step_num):
- """Training loop."""
- step_num = start_step_num
- logging.info('current starting step num: %d', step_num)
- for batch_id, (_, _, _, tag_ids, flag_nonnull_tag, out) in \
- enumerate(attach_prediction(data_loader, net, ctx, is_train=True)):
- logging.info('training on batch index: %d/%d', batch_id, len(data_loader))
-
- # step size adjustments
- step_num += 1
- if step_num < num_warmup_steps:
- new_lr = config.learning_rate * step_num / num_warmup_steps
- else:
- offset = ((step_num - num_warmup_steps) * config.learning_rate /
- (num_train_steps - num_warmup_steps))
- new_lr = config.learning_rate - offset
- trainer.set_learning_rate(new_lr)
-
- with mx.autograd.record():
- loss_value = loss_function(out, tag_ids,
- flag_nonnull_tag.expand_dims(axis=2)).mean()
-
- loss_value.backward()
- nlp.utils.clip_grad_global_norm(params, 1)
- trainer.step(1)
-
- pred_tags = out.argmax(axis=-1)
- logging.info('loss_value: %6f', loss_value.asscalar())
-
- num_tag_preds = flag_nonnull_tag.sum().asscalar()
- logging.info(
- 'accuracy: %6f', (((pred_tags == tag_ids) * flag_nonnull_tag).sum().asscalar()
- / num_tag_preds))
- return step_num
-
- def evaluate(data_loader):
- """Eval loop."""
- predictions = []
-
- for batch_id, (text_ids, _, valid_length, tag_ids, _, out) in \
- enumerate(attach_prediction(data_loader, net, ctx, is_train=False)):
- logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader))
-
- # convert results to numpy arrays for easier access
- np_text_ids = text_ids.astype('int32').asnumpy()
- np_pred_tags = out.argmax(axis=-1).asnumpy()
- np_valid_length = valid_length.astype('int32').asnumpy()
- np_true_tags = tag_ids.asnumpy()
-
- predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
- np_true_tags, np_pred_tags, np_valid_length)
-
- all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
- all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
- seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags)
- return seqeval_f1
-
- best_dev_f1 = 0.0
- last_test_f1 = 0.0
- best_epoch = -1
-
- last_epoch_step_num = 0
- for epoch_index in range(config.num_epochs):
- last_epoch_step_num = train(train_data_loader, last_epoch_step_num)
- train_f1 = evaluate(train_data_loader)
- logging.info('train f1: %3f', train_f1)
- dev_f1 = evaluate(dev_data_loader)
- logging.info('dev f1: %3f, previous best dev f1: %3f', dev_f1, best_dev_f1)
- if dev_f1 > best_dev_f1:
- best_dev_f1 = dev_f1
- best_epoch = epoch_index
- logging.info('update the best dev f1 to be: %3f', best_dev_f1)
- test_f1 = evaluate(test_data_loader)
- logging.info('test f1: %3f', test_f1)
- last_test_f1 = test_f1
-
- # save params
- params_file = config.save_checkpoint_prefix + '_{:03d}.params'.format(epoch_index)
- logging.info('saving current checkpoint to: %s', params_file)
- net.save_parameters(params_file)
-
- logging.info('current best epoch: %d', best_epoch)
-
- logging.info('best epoch: %d, best dev f1: %3f, test f1 at that epoch: %3f',
- best_epoch, best_dev_f1, last_test_f1)
-
-
-if __name__ == '__main__':
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
- level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
- logging.getLogger().setLevel(logging.INFO)
- main(parse_args())
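The removed training loop adjusts the learning rate by hand via `trainer.set_learning_rate`: linear warmup over the first `warmup_ratio` fraction of steps, then linear decay towards zero. The rule in isolation (numbers below are arbitrary):

```python
def scheduled_lr(step_num, base_lr, num_train_steps, num_warmup_steps):
    if step_num < num_warmup_steps:
        return base_lr * step_num / num_warmup_steps   # linear warmup
    offset = ((step_num - num_warmup_steps) * base_lr
              / (num_train_steps - num_warmup_steps))
    return base_lr - offset                            # linear decay

for step in (1, 50, 100, 400, 1000):
    print(step, scheduled_lr(step, base_lr=5e-5, num_train_steps=1000, num_warmup_steps=100))
```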
diff --git a/scripts/ner/index.rst b/scripts/ner/index.rst
deleted file mode 100644
index 3bffe81eeb..0000000000
--- a/scripts/ner/index.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-Named Entity Recognition
-------------------------
-
-:download:`Download scripts `
-
-Reference: Devlin, Jacob, et al. "`Bert: Pre-training of deep bidirectional transformers for language understanding. `_" arXiv preprint arXiv:1810.04805 (2018).
-
-Named Entity Recognition with BERT
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-GluonNLP provides training and prediction scripts for named entity recognition models.
-
-The training script for NER requires the seqeval package:
-
-.. code-block:: console
-
- $ pip install seqeval --user
-
-The dataset should be formatted in the `CoNLL-2003 shared task format `_.
-Assuming the data files are located in `${DATA_DIR}`, the command below trains a BERT model for
-named entity recognition and saves model artifacts to `${MODEL_DIR}` with the `large_bert`
-prefix in file names (assuming `${MODEL_DIR}` exists):
-
-.. code-block:: console
-
- $ python finetune_bert.py \
- --train-path ${DATA_DIR}/train.txt \
- --dev-path ${DATA_DIR}/dev.txt \
- --test-path ${DATA_DIR}/test.txt \
- --gpu 0 --learning-rate 1e-5 --dropout-prob 0.1 --num-epochs 100 --batch-size 8 \
- --optimizer bertadam --bert-model bert_24_1024_16 \
- --save-checkpoint-prefix ${MODEL_DIR}/large_bert --seed 13531
-
-This achieves a test F1 score between `91.5` and `92.2` (`log `_).
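The F1 numbers above are entity-level scores computed with seqeval, which expects one tag sequence per sentence. A toy example of the metric call used by the scripts (tags below are made up):

```python
import seqeval.metrics

y_true = [['B-PER', 'I-PER', 'O'], ['B-LOC', 'O']]
y_pred = [['B-PER', 'I-PER', 'O'], ['O', 'O']]
print(seqeval.metrics.f1_score(y_true, y_pred))  # ~0.67: one of the two gold entities is recovered
```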
diff --git a/scripts/ner/model.py b/scripts/ner/model.py
deleted file mode 100644
index 18a2076600..0000000000
--- a/scripts/ner/model.py
+++ /dev/null
@@ -1,100 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Gluon model block for the named entity recognition task."""
-
-from contextlib import ExitStack
-
-import mxnet as mx
-from mxnet.gluon import Block, nn
-
-
-class BERTTagger(Block):
- """Model for sequence tagging with BERT
-
- Parameters
- ----------
- bert_model: BERTModel
- Bidirectional encoder with transformer.
- num_tag_types: int
- number of possible tags
- dropout_prob: float
- dropout probability for the last layer
- prefix: str or None
- See document of `mx.gluon.Block`.
- params: ParameterDict or None
- See document of `mx.gluon.Block`.
- """
-
- def __init__(self, bert_model, num_tag_types, dropout_prob, prefix=None, params=None):
- super(BERTTagger, self).__init__(prefix=prefix, params=params)
- self.bert_model = bert_model
- with self.name_scope():
- self.tag_classifier = nn.Dense(units=num_tag_types, flatten=False)
- self.dropout = nn.Dropout(rate=dropout_prob)
-
- def forward(self, token_ids, token_types, valid_length): # pylint: disable=arguments-differ
- """Generate an unnormalized score for the tag of each token
-
- Parameters
- ----------
- token_ids: NDArray, shape (batch_size, seq_length)
- ID of tokens in sentences
- See `input` of `gluonnlp.model.BERTModel`
- token_types: NDArray, shape (batch_size, seq_length)
- See `gluonnlp.model.BERTModel`
- valid_length: NDArray, shape (batch_size,)
- See `gluonnlp.model.BERTModel`
-
- Returns
- -------
- NDArray, shape (batch_size, seq_length, num_tag_types):
- Unnormalized prediction scores for each tag on each position.
- """
- bert_output = self.dropout(self.bert_model(token_ids, token_types, valid_length))
- output = self.tag_classifier(bert_output)
- return output
-
-
-def attach_prediction(data_loader, net, ctx, is_train):
- """Attach the prediction from a model to a data loader as the last field.
-
- Parameters
- ----------
- data_loader: mx.gluon.data.DataLoader
- Input data from `bert_model.BERTTaggingDataset._encode_as_input`.
- net: mx.gluon.Block
- gluon `Block` for making the prediction.
- ctx:
- The context data should be loaded to.
- is_train:
- Whether the forward pass should be made with `mx.autograd.record()`.
-
- Returns
- -------
- All fields from `bert_model.BERTTaggingDataset._encode_as_input`,
- as well as the prediction of the model.
-
- """
- for data in data_loader:
- text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag = \
- [x.astype('float32').as_in_context(ctx) for x in data]
-
- with ExitStack() as stack:
- if is_train:
- stack.enter_context(mx.autograd.record())
- out = net(text_ids, token_types, valid_length)
- yield text_ids, token_types, valid_length, tag_ids, flag_nonnull_tag, out
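`attach_prediction` above uses `contextlib.ExitStack` so the same loop serves training and evaluation: `mx.autograd.record()` is entered only when `is_train` is set. A stripped-down, framework-free illustration of that pattern (the `recording` context manager is a stand-in):

```python
from contextlib import ExitStack, contextmanager

@contextmanager
def recording():  # stand-in for mx.autograd.record()
    print('recording on')
    yield
    print('recording off')

def forward(is_train):
    with ExitStack() as stack:
        if is_train:
            stack.enter_context(recording())
        return 'output'

forward(is_train=True)   # enters and exits the context
forward(is_train=False)  # runs without it
```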
diff --git a/scripts/ner/ner_utils.py b/scripts/ner/ner_utils.py
deleted file mode 100644
index 332b548aa0..0000000000
--- a/scripts/ner/ner_utils.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Common utilities for the named entity recognition task."""
-
-import argparse
-import pickle
-from collections import namedtuple
-
-import mxnet as mx
-import gluonnlp as nlp
-
-__all__ = ['get_bert_model', 'get_bert_dataset_name', 'get_context',
- 'dump_metadata']
-
-BERTModelMetadata = namedtuple('BERTModelMetadata', ['config', 'tag_vocab'])
-
-def _metadata_file_path(checkpoint_prefix):
- """Gets the file path for meta data"""
- return checkpoint_prefix + '_metadata.pkl'
-
-
-def dump_metadata(config, tag_vocab):
- """Dumps meta-data to the configured path"""
- metadata = BERTModelMetadata(config=config, tag_vocab=tag_vocab)
- with open(_metadata_file_path(config.save_checkpoint_prefix), 'wb') as ofp:
- pickle.dump(metadata, ofp)
-
-
-def load_metadata(checkpoint_prefix):
- """Loads meta-data to the configured path"""
- with open(_metadata_file_path(checkpoint_prefix), 'rb') as ifp:
- metadata = pickle.load(ifp)
- return metadata.config, metadata.tag_vocab
-
-
-def get_context(gpu_index):
- """This method gets context of execution"""
- context = None
- if gpu_index is None or gpu_index == '':
- context = mx.cpu()
- if isinstance(gpu_index, int):
- context = mx.gpu(gpu_index)
- return context
-
-
-def str2bool(v):
- """Utility function for parsing boolean in argparse
-
- https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
-
- :param v: value of the argument
- :return:
- """
- if v.lower() in ('yes', 'true', 't', 'y', '1'):
- return True
- elif v.lower() in ('no', 'false', 'f', 'n', '0'):
- return False
- else:
- raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-def get_bert_dataset_name(is_cased):
- """Returns relevant BERT dataset name, depending on whether we are using a cased model.
-
- Parameters
- ----------
- is_cased: bool
- Whether we are using a cased model.
-
- Returns
- -------
- str: Name of the BERT dataset.
-
- """
- if is_cased:
- return 'book_corpus_wiki_en_cased'
- else:
- return 'book_corpus_wiki_en_uncased'
-
-
-def get_bert_model(bert_model, cased, ctx, dropout_prob):
- """Get pre-trained BERT model."""
- bert_dataset_name = get_bert_dataset_name(cased)
-
- return nlp.model.get_model(
- name=bert_model,
- dataset_name=bert_dataset_name,
- pretrained=True,
- ctx=ctx,
- use_pooler=False,
- use_decoder=False,
- use_classifier=False,
- dropout=dropout_prob,
- embed_dropout=dropout_prob)
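The `str2bool` helper removed above exists because argparse treats any non-empty string as truthy; routing the value through a parser lets flags such as `--cased false` behave as expected. A quick usage sketch (the parser below is hypothetical):

```python
import argparse

def str2bool(v):
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    if v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')

parser = argparse.ArgumentParser()
parser.add_argument('--cased', type=str2bool, default=True)
print(parser.parse_args(['--cased', 'false']).cased)  # False
```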
diff --git a/scripts/ner/predict_ner.py b/scripts/ner/predict_ner.py
deleted file mode 100644
index abdc3ec535..0000000000
--- a/scripts/ner/predict_ner.py
+++ /dev/null
@@ -1,130 +0,0 @@
-#!/usr/bin/env python
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Script for NER prediction."""
-
-import argparse
-import logging
-import os
-
-import mxnet as mx
-from ner_utils import get_bert_model, get_context
-from ner_utils import load_metadata
-from data import BERTTaggingDataset, convert_arrays_to_text
-from model import BERTTagger
-
-# TODO(bikestra): Currently, our evaluation is dependent on this package.
-# Figure out whether to take actual dependency on it.
-try:
- import seqeval.metrics
-except ImportError:
- raise ImportError('seqeval is required to run NER on BERT. Please '
- 'install it via pip3 install seqeval --user')
-
-
-def _find_model_file_from_checkpoint(checkpoint_prefix: str):
- """Load model checkpoint"""
- dirname, file_prefix = os.path.split(checkpoint_prefix)
- # find checkpoint file names and sort by name to find the most recent one.
- checkpoint_filenames = ([f for f in os.listdir(dirname)
- if f.startswith(file_prefix)
- and f.endswith(os.path.extsep + 'params')])
- last_checkpoint_filename = max(checkpoint_filenames)
- logging.info('found checkpoint filename: {:s}'.format(last_checkpoint_filename))
- last_checkpoint_path = os.path.join(dirname, last_checkpoint_filename)
- return last_checkpoint_path
-
-
-def parse_args():
- """Parse command line arguments."""
- arg_parser = argparse.ArgumentParser(
- description='Predict on CoNLL format data using BERT-based named entity recognition model',
- formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-
- # data file paths
- arg_parser.add_argument('--test-path', type=str, required=True,
- help='Path to the test data file')
- arg_parser.add_argument('--seq-len', type=int, default=200,
- help='The length of the sequence input to BERT.'
- ' An exception will be raised if this is not large enough.')
- arg_parser.add_argument('--load-checkpoint-prefix', type=str, required=False, default=None,
- help='Prefix of model checkpoint file')
-
- arg_parser.add_argument('--gpu', type=int,
- help='Number (index) of GPU to run on, e.g. 0. '
- 'If not specified, CPU context is used.')
- arg_parser.add_argument('--batch-size', type=int, default=32, help='Batch size for training')
- args = arg_parser.parse_args()
- return args
-
-
-def main(config):
- """Main method for predicting BERT-based NER model on CoNLL-formatted test data."""
- train_config, tag_vocab = load_metadata(config.load_checkpoint_prefix)
-
- ctx = get_context(config.gpu)
- bert_model, text_vocab = get_bert_model(train_config.bert_model, train_config.cased, ctx,
- train_config.dropout_prob)
-
- dataset = BERTTaggingDataset(text_vocab, None, None, config.test_path,
- config.seq_len, train_config.cased, tag_vocab=tag_vocab)
-
- test_data_loader = dataset.get_test_data_loader(config.batch_size)
-
- net = BERTTagger(bert_model, dataset.num_tag_types, train_config.dropout_prob)
- model_filename = _find_model_file_from_checkpoint(config.load_checkpoint_prefix)
- net.load_parameters(model_filename, ctx=ctx)
-
- net.hybridize(static_alloc=True)
-
- loss_function = mx.gluon.loss.SoftmaxCrossEntropyLoss()
- loss_function.hybridize(static_alloc=True)
-
- # TODO(bikestra): make it not redundant between train and predict
- def evaluate(data_loader):
- """Eval function"""
- predictions = []
-
- for batch_id, data in enumerate(data_loader):
- logging.info('evaluating on batch index: %d/%d', batch_id, len(data_loader))
- text_ids, token_types, valid_length, tag_ids, _ = \
- [x.astype('float32').as_in_context(ctx) for x in data]
- out = net(text_ids, token_types, valid_length)
-
- # convert results to numpy arrays for easier access
- np_text_ids = text_ids.astype('int32').asnumpy()
- np_pred_tags = out.argmax(axis=-1).asnumpy()
- np_valid_length = valid_length.astype('int32').asnumpy()
- np_true_tags = tag_ids.asnumpy()
-
- predictions += convert_arrays_to_text(text_vocab, dataset.tag_vocab, np_text_ids,
- np_true_tags, np_pred_tags, np_valid_length)
-
- all_true_tags = [[entry.true_tag for entry in entries] for entries in predictions]
- all_pred_tags = [[entry.pred_tag for entry in entries] for entries in predictions]
- seqeval_f1 = seqeval.metrics.f1_score(all_true_tags, all_pred_tags)
- return seqeval_f1
-
- test_f1 = evaluate(test_data_loader)
- logging.info('test f1: {:.3f}'.format(test_f1))
-
-
-if __name__ == '__main__':
- logging.basicConfig(format='%(asctime)s %(levelname)s: %(message)s',
- level=logging.DEBUG, datefmt='%Y-%m-%d %I:%M:%S')
- logging.getLogger().setLevel(logging.INFO)
- main(parse_args())
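`_find_model_file_from_checkpoint` above picks the most recent checkpoint by taking the lexicographic maximum of the matching file names, which works because the training script zero-pads the epoch index (e.g. `_003.params`). In miniature (file names invented):

```python
names = ['large_bert_001.params', 'large_bert_010.params', 'large_bert_002.params']
prefix = 'large_bert'
candidates = [n for n in names if n.startswith(prefix) and n.endswith('.params')]
print(max(candidates))  # large_bert_010.params
```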
diff --git a/scripts/parsing/__init__.py b/scripts/parsing/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/common/__init__.py b/scripts/parsing/common/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/common/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/common/config.py b/scripts/parsing/common/config.py
deleted file mode 100644
index 9a9a1dc63a..0000000000
--- a/scripts/parsing/common/config.py
+++ /dev/null
@@ -1,118 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Training config."""
-
-import os
-import pickle
-
-from scripts.parsing.common.savable import Savable
-
-
-class _Config(Savable):
- def __init__(self, train_file, dev_file, test_file, save_dir,
- pretrained_embeddings_file=None, min_occur_count=2,
- lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400,
- dropout_lstm_input=0.33,
- dropout_lstm_hidden=0.33, mlp_arc_size=500, mlp_rel_size=100,
- dropout_mlp=0.33, learning_rate=2e-3, decay=.75, decay_steps=5000,
- beta_1=.9, beta_2=.9, epsilon=1e-12,
- num_buckets_train=40,
- num_buckets_valid=10, num_buckets_test=10,
- train_iters=50000, train_batch_size=5000, debug=False):
- """Internal structure for hyper parameters, intended for pickle serialization.
-
- May be replaced by a dict, but this class provides intuitive properties
- and a saving/loading mechanism.
-
- Parameters
- ----------
- train_file
- dev_file
- test_file
- save_dir
- pretrained_embeddings_file
- min_occur_count
- lstm_layers
- word_dims
- tag_dims
- dropout_emb
- lstm_hiddens
- dropout_lstm_input
- dropout_lstm_hidden
- mlp_arc_size
- mlp_rel_size
- dropout_mlp
- learning_rate
- decay
- decay_steps
- beta_1
- beta_2
- epsilon
- num_buckets_train
- num_buckets_valid
- num_buckets_test
- train_iters
- train_batch_size
- debug
- """
- super(_Config, self).__init__()
- self.pretrained_embeddings_file = pretrained_embeddings_file
- self.train_file = train_file
- self.dev_file = dev_file
- self.test_file = test_file
- self.min_occur_count = min_occur_count
- self.save_dir = save_dir
- self.lstm_layers = lstm_layers
- self.word_dims = word_dims
- self.tag_dims = tag_dims
- self.dropout_emb = dropout_emb
- self.lstm_hiddens = lstm_hiddens
- self.dropout_lstm_input = dropout_lstm_input
- self.dropout_lstm_hidden = dropout_lstm_hidden
- self.mlp_arc_size = mlp_arc_size
- self.mlp_rel_size = mlp_rel_size
- self.dropout_mlp = dropout_mlp
- self.learning_rate = learning_rate
- self.decay = decay
- self.decay_steps = decay_steps
- self.beta_1 = beta_1
- self.beta_2 = beta_2
- self.epsilon = epsilon
- self.num_buckets_train = num_buckets_train
- self.num_buckets_valid = num_buckets_valid
- self.num_buckets_test = num_buckets_test
- self.train_iters = train_iters
- self.train_batch_size = train_batch_size
- self.debug = debug
-
- @property
- def save_model_path(self):
- return os.path.join(self.save_dir, 'model.bin')
-
- @property
- def save_vocab_path(self):
- return os.path.join(self.save_dir, 'vocab.pkl')
-
- @property
- def save_config_path(self):
- return os.path.join(self.save_dir, 'config.pkl')
-
- def save(self, path=None):
- if not path:
- path = self.save_config_path
- with open(path, 'wb') as f:
- pickle.dump(self, f)
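The parsing `_Config` above persists itself with pickle via the `Savable` base class. The underlying round trip, reduced to plain Python (paths and values here are placeholders):

```python
import os
import pickle
import tempfile

config = {'lstm_layers': 3, 'word_dims': 100, 'learning_rate': 2e-3}
path = os.path.join(tempfile.mkdtemp(), 'config.pkl')
with open(path, 'wb') as f:
    pickle.dump(config, f)
with open(path, 'rb') as f:
    print(pickle.load(f))
```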
diff --git a/scripts/parsing/common/data.py b/scripts/parsing/common/data.py
deleted file mode 100644
index a2ac0585ad..0000000000
--- a/scripts/parsing/common/data.py
+++ /dev/null
@@ -1,474 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""CoNLL format template."""
-
-from collections import Counter
-import numpy as np
-
-import gluonnlp
-from scripts.parsing.common.k_means import KMeans
-
-from .savable import Savable
-
-
-class ConllWord:
- """CoNLL format template, see http://anthology.aclweb.org/W/W06/W06-2920.pdf
-
- Parameters
- ----------
- id : int
- Token counter, starting at 1 for each new sentence.
- form : str
- Word form or punctuation symbol.
- lemma : str
- Lemma or stem (depending on the particular treebank) of word form,
- or an underscore if not available.
- cpos : str
- Coarse-grained part-of-speech tag, where the tagset depends on the treebank.
- pos : str
- Fine-grained part-of-speech tag, where the tagset depends on the treebank.
- feats : str
- Unordered set of syntactic and/or morphological features
- (depending on the particular treebank), or an underscore if not available.
- head : int
- Head of the current token, which is either a value of ID,
- or zero ('0') if the token links to the virtual root node of the sentence.
- relation : str
- Dependency relation to the HEAD.
- phead : int
- Projective head of current token, which is either a value of ID or zero ('0'),
- or an underscore if not available.
- pdeprel : str
- Dependency relation to the PHEAD, or an underscore if not available.
- """
- def __init__(self, idx, form, lemma=None, cpos=None, pos=None, feats=None,
- head=None, relation=None, phead=None, pdeprel=None):
- self.idx = idx
- self.form = form
- self.cpos = cpos
- self.pos = pos
- self.head = head
- self.relation = relation
- self.lemma = lemma
- self.feats = feats
- self.phead = phead
- self.pdeprel = pdeprel
-
- def __str__(self):
- values = [str(self.idx), self.form, self.lemma, self.cpos, self.pos, self.feats,
- str(self.head), self.relation, self.phead, self.pdeprel]
- return '\t'.join(['_' if v is None else v for v in values])
-
-
-class ConllSentence:
- """A list of ConllWord
-
- Parameters
- ----------
- words : list of ConllWord
- the words of a sentence
- """
- def __init__(self, words):
- super().__init__()
- self.words = words
-
- def __str__(self):
- return '\n'.join([word.__str__() for word in self.words])
-
- def __len__(self):
- return len(self.words)
-
- def __getitem__(self, index):
- return self.words[index]
-
- def __iter__(self):
- return (line for line in self.words)
-
-
-class ParserVocabulary(Savable):
- """Vocabulary, holds word, tag and relation along with their id.
-
- Load from conll file
- Adapted from https://github.com/jcyk/Dynet-Biaffine-dependency-parser with some modifications
-
- Parameters
- ----------
- input_file : str
- conll file
- pret_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- min_occur_count : int
- word frequency threshold; words occurring no more than this many times are replaced by UNK
- """
- def __init__(self, input_file, pret_embeddings=None, min_occur_count=2):
- super().__init__()
- word_counter = Counter()
- tag_set = set()
- rel_set = set()
-
- with open(input_file) as f:
- for line in f:
- info = line.strip().split()
- if info:
- if len(info) == 10:
- rel_offset = 7
- elif len(info) == 8:
- rel_offset = 6
- word, tag = info[1].lower(), info[3]
- rel = info[rel_offset]
- word_counter[word] += 1
- tag_set.add(tag)
- if rel != 'root':
- rel_set.add(rel)
-
- self._id2word = ['<pad>', '<root>', '<unk>']
- self._id2tag = ['<pad>', '<root>', '<unk>']
- self._id2rel = ['<pad>', 'root']
-
- def reverse(x):
- return dict(list(zip(x, list(range(len(x))))))
-
- for word, count in word_counter.most_common():
- if count > min_occur_count:
- self._id2word.append(word)
-
- self._pret_embeddings = pret_embeddings
- self._words_in_train_data = len(self._id2word)
- if pret_embeddings:
- self._add_pret_words(pret_embeddings)
- self._id2tag += list(tag_set)
- self._id2rel += list(rel_set)
-
- self._word2id = reverse(self._id2word)
- self._tag2id = reverse(self._id2tag)
- self._rel2id = reverse(self._id2rel)
-
- PAD, ROOT, UNK = 0, 1, 2 # Padding, Root, Unknown
-
- def log_info(self, logger):
- """Print statistical information via the provided logger
-
- Parameters
- ----------
- logger : logging.Logger
- logger created using logging.getLogger()
- """
- logger.info('#words in training set: %d', self._words_in_train_data)
- logger.info('Vocab info: #words %d, #tags %d #rels %d',
- self.vocab_size, self.tag_size, self.rel_size)
-
- def _add_pret_words(self, pret_embeddings):
- """Read pre-trained embedding file for extending vocabulary
-
- Parameters
- ----------
- pret_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- """
- words_in_train_data = set(self._id2word)
- pret_embeddings = gluonnlp.embedding.create(pret_embeddings[0], source=pret_embeddings[1])
-
- for token in pret_embeddings.idx_to_token:
- if token not in words_in_train_data:
- self._id2word.append(token)
-
- def has_pret_embs(self):
- """Check whether this vocabulary contains words from pre-trained embeddings
-
- Returns
- -------
- bool : Whether this vocabulary contains words from pre-trained embeddings
- """
- return self._pret_embeddings is not None
-
- def get_pret_embs(self, word_dims=None):
- """Read pre-trained embedding file
-
- Parameters
- ----------
- word_dims : int or None
- vector size. Use `None` for auto-infer
-
- Returns
- -------
- numpy.ndarray
- T x C numpy NDArray
- """
- assert self._pret_embeddings is not None, 'No pretrained file provided.'
- pret_embeddings = gluonnlp.embedding.create(self._pret_embeddings[0],
- source=self._pret_embeddings[1])
- embs = [None] * len(self._id2word)
- for idx, vec in enumerate(pret_embeddings.idx_to_vec):
- embs[idx] = vec.asnumpy()
- if word_dims is None:
- word_dims = len(pret_embeddings.idx_to_vec[0])
- for idx, emb in enumerate(embs):
- if emb is None:
- embs[idx] = np.zeros(word_dims)
- pret_embs = np.array(embs, dtype=np.float32)
- return pret_embs / np.std(pret_embs)
-
- def get_word_embs(self, word_dims):
- """Get randomly initialized embeddings when pre-trained embeddings are used,
- otherwise zero vectors.
-
- Parameters
- ----------
- word_dims : int
- word vector size
- Returns
- -------
- numpy.ndarray
- T x C numpy NDArray
- """
- if self._pret_embeddings is not None:
- return np.random.randn(self.words_in_train, word_dims).astype(np.float32)
- return np.zeros((self.words_in_train, word_dims), dtype=np.float32)
-
- def get_tag_embs(self, tag_dims):
- """Randomly initialize embeddings for tag
-
- Parameters
- ----------
- tag_dims : int
- tag vector size
-
- Returns
- -------
- numpy.ndarray
- random embeddings
- """
- return np.random.randn(self.tag_size, tag_dims).astype(np.float32)
-
- def word2id(self, xs):
- """Map word(s) to its id(s)
-
- Parameters
- ----------
- xs : str or list
- word or a list of words
-
- Returns
- -------
- int or list
- id or a list of ids
- """
- if isinstance(xs, list):
- return [self._word2id.get(x, self.UNK) for x in xs]
- return self._word2id.get(xs, self.UNK)
-
- def id2word(self, xs):
- """Map id(s) to word(s)
-
- Parameters
- ----------
- xs : int
- id or a list of ids
-
- Returns
- -------
- str or list
- word or a list of words
- """
- if isinstance(xs, list):
- return [self._id2word[x] for x in xs]
- return self._id2word[xs]
-
- def rel2id(self, xs):
- """Map relation(s) to id(s)
-
- Parameters
- ----------
- xs : str or list
- relation
-
- Returns
- -------
- int or list
- id(s) of relation
- """
- if isinstance(xs, list):
- return [self._rel2id[x] for x in xs]
- return self._rel2id[xs]
-
- def id2rel(self, xs):
- """Map id(s) to relation(s)
-
- Parameters
- ----------
- xs : int
- id or a list of ids
-
- Returns
- -------
- str or list
- relation or a list of relations
- """
- if isinstance(xs, list):
- return [self._id2rel[x] for x in xs]
- return self._id2rel[xs]
-
- def tag2id(self, xs):
- """Map tag(s) to id(s)
-
- Parameters
- ----------
- xs : str or list
- tag or tags
-
- Returns
- -------
- int or list
- id(s) of tag(s)
- """
- if isinstance(xs, list):
- return [self._tag2id.get(x, self.UNK) for x in xs]
- return self._tag2id.get(xs, self.UNK)
-
- @property
- def words_in_train(self):
- """
- get #words in training set
- Returns
- -------
- int
- #words in training set
- """
- return self._words_in_train_data
-
- @property
- def vocab_size(self):
- return len(self._id2word)
-
- @property
- def tag_size(self):
- return len(self._id2tag)
-
- @property
- def rel_size(self):
- return len(self._id2rel)
-
-
-class DataLoader:
- """
- Load CoNLL data
- Adapted from https://github.com/jcyk/Dynet-Biaffine-dependency-parser with some modifications
-
- Parameters
- ----------
- input_file : str
- path to CoNLL file
- n_bkts : int
- number of buckets
- vocab : ParserVocabulary
- vocabulary object
- """
-
- def __init__(self, input_file, n_bkts, vocab):
- self.vocab = vocab
- sents = []
- sent = [[ParserVocabulary.ROOT, ParserVocabulary.ROOT, 0, ParserVocabulary.ROOT]]
- with open(input_file) as f:
- for line in f:
- info = line.strip().split()
- if info:
- arc_offset = 5
- rel_offset = 6
- if len(info) == 10:
- arc_offset = 6
- rel_offset = 7
- assert info[rel_offset] in vocab._rel2id, 'Relation OOV: %s' % line
- word, tag = vocab.word2id(info[1].lower()), vocab.tag2id(info[3])
- head, rel = int(info[arc_offset]), vocab.rel2id(info[rel_offset])
- sent.append([word, tag, head, rel])
- else:
- sents.append(sent)
- sent = [[ParserVocabulary.ROOT, ParserVocabulary.ROOT, 0,
- ParserVocabulary.ROOT]]
- if len(sent) > 1: # last sent in file without '\n'
- sents.append(sent)
-
- self.samples = len(sents)
- len_counter = Counter()
- for sent in sents:
- len_counter[len(sent)] += 1
- self._bucket_lengths = KMeans(n_bkts, len_counter).splits
- self._buckets = [[] for i in range(n_bkts)]
- # bkt_idx x length x sent_idx x 4
- len2bkt = {}
- prev_length = -1
- for bkt_idx, length in enumerate(self._bucket_lengths):
- len2bkt.update(list(zip(list(range(prev_length + 1, length + 1)),
- [bkt_idx] * (length - prev_length))))
- prev_length = length
-
- self._record = []
- for sent in sents:
- bkt_idx = len2bkt[len(sent)]
- idx = len(self._buckets[bkt_idx])
- self._buckets[bkt_idx].append(sent)
- self._record.append((bkt_idx, idx))
-
- for bkt_idx, (bucket, length) in enumerate(zip(self._buckets, self._bucket_lengths)):
- self._buckets[bkt_idx] = np.zeros((length, len(bucket), 4), dtype=np.int32)
- for idx, sent in enumerate(bucket):
- self._buckets[bkt_idx][:len(sent), idx, :] = np.array(sent, dtype=np.int32)
-
- @property
- def idx_sequence(self):
- """Indices of sentences when enumerating data set from batches.
- Useful when retrieving the correct order of sentences
-
- Returns
- -------
- list
- List of ids ranging from 0 to #sent -1
- """
- return [x[1] for x in sorted(zip(self._record, list(range(len(self._record)))))]
-
- def get_batches(self, batch_size, shuffle=True):
- """Get batch iterator
-
- Parameters
- ----------
- batch_size : int
- size of one batch
- shuffle : bool
- whether to shuffle batches. Don't set to True when evaluating on dev or test set.
- Returns
- -------
- tuple
- word_inputs, tag_inputs, arc_targets, rel_targets
- """
- batches = []
- for bkt_idx, bucket in enumerate(self._buckets):
- bucket_size = bucket.shape[1]
- n_tokens = bucket_size * self._bucket_lengths[bkt_idx]
- n_splits = min(max(n_tokens // batch_size, 1), bucket_size)
- range_func = np.random.permutation if shuffle else np.arange
- for bkt_batch in np.array_split(range_func(bucket_size), n_splits):
- batches.append((bkt_idx, bkt_batch))
-
- if shuffle:
- np.random.shuffle(batches)
-
- for bkt_idx, bkt_batch in batches:
- word_inputs = self._buckets[bkt_idx][:, bkt_batch, 0] # word_id x sent_id
- tag_inputs = self._buckets[bkt_idx][:, bkt_batch, 1]
- arc_targets = self._buckets[bkt_idx][:, bkt_batch, 2]
- rel_targets = self._buckets[bkt_idx][:, bkt_batch, 3]
- yield word_inputs, tag_inputs, arc_targets, rel_targets
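For orientation, this is roughly how the vocabulary and loader deleted above fit together; the file path, bucket count and batch size below are hypothetical, and the module is assumed importable:

```python
# Sketch under assumptions: 'train.conllx' is a CoNLL-format file on disk.
vocab = ParserVocabulary('train.conllx', pret_embeddings=None, min_occur_count=2)
loader = DataLoader('train.conllx', n_bkts=10, vocab=vocab)
for words, tags, arcs, rels in loader.get_batches(batch_size=5000, shuffle=True):
    # each array is seq_len x batch_size; index 0 of every sentence is the ROOT token
    pass
```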
diff --git a/scripts/parsing/common/exponential_scheduler.py b/scripts/parsing/common/exponential_scheduler.py
deleted file mode 100644
index 3773a129f2..0000000000
--- a/scripts/parsing/common/exponential_scheduler.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Learning rate schedule for parser training."""
-
-from mxnet.lr_scheduler import LRScheduler
-
-
-class ExponentialScheduler(LRScheduler):
- """A simple learning rate decay scheduler
- lr = base_lr * decay_rate ^ (num_update / decay_every)
-
- Parameters
- ----------
- base_lr : float
- the initial learning rate.
- decay_rate : float
- the factor the learning rate is multiplied by after every `decay_every` updates
- decay_every : float
- the number of updates per decay step
- """
- def __init__(self, base_lr=0.01, decay_rate=0.5, decay_every=1):
- super().__init__(base_lr)
- self.decay_rate = decay_rate
- self.decay_every = decay_every
-
- def __call__(self, num_update):
- return self.base_lr * self.decay_rate ** (num_update / self.decay_every)
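A quick numeric check of the decay formula, using the defaults the parser config above passes in (learning_rate=2e-3, decay=0.75, decay_steps=5000); this assumes MXNet is installed, since the class subclasses mxnet's LRScheduler:

```python
sched = ExponentialScheduler(base_lr=2e-3, decay_rate=0.75, decay_every=5000)
print(sched(0))      # 0.002
print(sched(5000))   # 0.002 * 0.75   = 0.0015
print(sched(10000))  # 0.002 * 0.75^2 = 0.001125
```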
diff --git a/scripts/parsing/common/k_means.py b/scripts/parsing/common/k_means.py
deleted file mode 100755
index 512ee0d2a0..0000000000
--- a/scripts/parsing/common/k_means.py
+++ /dev/null
@@ -1,183 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2016 Timothy Dozat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""KMeans utility."""
-
-from collections import Counter
-
-import numpy as np
-
-
-class KMeans:
- """
- Cluster sentences by their lengths
-
- Parameters
- ----------
- k : int
- number of clusters
- len_cntr : Counter
- length counter
- """
- def __init__(self, k, len_cntr):
- # Error checking
- if len(len_cntr) < k:
- raise ValueError('Trying to sort %d data points into %d buckets' % (len(len_cntr), k))
-
- # Initialize variables
- self._k = k
- self._len_cntr = len_cntr
- self._lengths = sorted(self._len_cntr.keys())
- self._splits = []
- self._split2len_idx = {}
- self._len2split_idx = {}
- self._split_cntr = Counter()
-
- # Initialize the splits evenly
- lengths = []
- unique_length = []
- for length, count in list(self._len_cntr.items()):
- lengths.extend([length] * count)
- unique_length.append(length)
- lengths.sort()
- unique_length.sort()
- self._splits = [np.max(split) for split in np.array_split(lengths, self._k)]
-
- i = len(self._splits) - 1
- while i > 0:
- while self._splits[i - 1] >= self._splits[i]:
- index = unique_length.index(self._splits[i - 1])
- if index == 0:
- break
- self._splits[i - 1] = unique_length[index - 1]
- i -= 1
-
- unique_length.reverse()
- i = 1
- while i < len(self._splits) - 1:
- while self._splits[i] <= self._splits[i - 1]:
- index = unique_length.index(self._splits[i])
- if index == 0:
- break
- self._splits[i] = unique_length[index - 1]
- i += 1
-
- # Reindex everything
- split_idx = 0
- split = self._splits[split_idx]
- for len_idx, length in enumerate(self._lengths):
- count = self._len_cntr[length]
- self._split_cntr[split] += count
- if length == split:
- self._split2len_idx[split] = len_idx
- split_idx += 1
- if split_idx < len(self._splits):
- split = self._splits[split_idx]
- self._split_cntr[split] = 0
- elif length > split:
- raise IndexError()
-
- # Iterate
- old_splits = None
- # print('0) Initial splits: %s; Initial mass: %d' % (self._splits, self.get_mass()))
- i = 0
- while self._splits != old_splits:
- old_splits = list(self._splits)
- self._recenter()
- i += 1
- # print('%d) Final splits: %s; Final mass: %d' % (i, self._splits, self.get_mass()))
-
- self._reindex()
-
- def _recenter(self):
- """
- one iteration of k-means
- """
- for split_idx in range(len(self._splits)):
- split = self._splits[split_idx]
- len_idx = self._split2len_idx[split]
- if split == self._splits[-1]:
- continue
- right_split = self._splits[split_idx + 1]
-
- # Try shifting the centroid to the left
- if len_idx > 0 and self._lengths[len_idx - 1] not in self._split_cntr:
- new_split = self._lengths[len_idx - 1]
- left_delta = (self._len_cntr[split] * (right_split - new_split)
- - self._split_cntr[split] * (split - new_split))
- if left_delta < 0:
- self._splits[split_idx] = new_split
- self._split2len_idx[new_split] = len_idx - 1
- del self._split2len_idx[split]
- self._split_cntr[split] -= self._len_cntr[split]
- self._split_cntr[right_split] += self._len_cntr[split]
- self._split_cntr[new_split] = self._split_cntr[split]
- del self._split_cntr[split]
-
- # Try shifting the centroid to the right
- elif len_idx < len(self._lengths) - 2 \
- and self._lengths[len_idx + 1] not in self._split_cntr:
- new_split = self._lengths[len_idx + 1]
- right_delta = (self._split_cntr[split] * (new_split - split)
- - self._len_cntr[split] * (new_split - split))
- if right_delta <= 0:
- self._splits[split_idx] = new_split
- self._split2len_idx[new_split] = len_idx + 1
- del self._split2len_idx[split]
- self._split_cntr[split] += self._len_cntr[split]
- self._split_cntr[right_split] -= self._len_cntr[split]
- self._split_cntr[new_split] = self._split_cntr[split]
- del self._split_cntr[split]
-
- def _reindex(self):
- """
- Index every sentence into a cluster
- """
- self._len2split_idx = {}
- last_split = -1
- for split_idx, split in enumerate(self._splits):
- self._len2split_idx.update(
- dict(list(zip(list(range(last_split + 1, split)),
- [split_idx] * (split - (last_split + 1))))))
-
- def __len__(self):
- return self._k
-
- def __iter__(self):
- return (split for split in self.splits)
-
- def __getitem__(self, key):
- return self._splits[key]
-
- @property
- def splits(self):
- """Get clusters
-
- Returns
- -------
- list
- the maximum sentence length (right edge) of each bucket
- """
- return self._splits
-
- @property
- def len2split_idx(self):
- """Get length to bucket mapping
-
- Returns
- -------
- dict
- mapping from sentence length to bucket index
- """
- return self._len2split_idx
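A minimal usage sketch: feed the clustering a Counter of sentence lengths and read back the bucket boundaries (the counts below are made up for illustration):

```python
from collections import Counter

len_cntr = Counter({3: 10, 5: 7, 8: 4, 12: 2, 20: 1})  # length -> #sentences
km = KMeans(3, len_cntr)
print(km.splits)  # the right edge (maximum sentence length) of each bucket
```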
diff --git a/scripts/parsing/common/savable.py b/scripts/parsing/common/savable.py
deleted file mode 100644
index 55dd42909c..0000000000
--- a/scripts/parsing/common/savable.py
+++ /dev/null
@@ -1,56 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility base class for saving objects."""
-
-import pickle
-
-
-class Savable:
- """
- A base class providing pickle-based save/load operations.
- """
-
- def __init__(self):
- super().__init__()
-
- def save(self, path):
- """Save to path
-
- Parameters
- ----------
- path : str
- file path
- """
- with open(path, 'wb') as f:
- pickle.dump(self, f)
-
- @staticmethod
- def load(path):
- """Load from path
-
- Parameters
- ----------
- path : str
- file path
-
- Returns
- -------
- Savable
- An object
- """
- with open(path, 'rb') as f:
- return pickle.load(f)
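A round-trip sketch for the base class above (the subclass and file name are hypothetical):

```python
class Stats(Savable):
    def __init__(self, n):
        super().__init__()
        self.n = n

Stats(42).save('stats.pkl')          # pickles the whole object
print(Savable.load('stats.pkl').n)   # 42
```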
diff --git a/scripts/parsing/common/tarjan.py b/scripts/parsing/common/tarjan.py
deleted file mode 100755
index bf60a2adaf..0000000000
--- a/scripts/parsing/common/tarjan.py
+++ /dev/null
@@ -1,95 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2016 Timothy Dozat
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tarjan's algorithm for strongly connected components."""
-
-from collections import defaultdict
-
-
-# ***************************************************************
-class Tarjan:
- """
- Computes Tarjan's algorithm for finding strongly connected components (cycles) of a graph
-
- Attributes:
- edges: dictionary of edges such that edges[dep] = head
- vertices: set of dependents
- SCCs: list of sets of strongly connected components. Non-singleton sets are cycles.
-
- Parameters
- ----------
- prediction : numpy.ndarray
- a predicted dependency tree where prediction[dep_idx] = head_idx
- tokens : numpy.ndarray
- the tokens we care about (i.e. exclude _GO, _EOS, and _PAD)
- """
- def __init__(self, prediction, tokens):
- self._edges = defaultdict(set)
- self._vertices = set((0,))
- for dep, head in enumerate(prediction[tokens]):
- self._vertices.add(dep + 1)
- self._edges[head].add(dep + 1)
- self._indices = {}
- self._lowlinks = {}
- self._onstack = defaultdict(lambda: False)
- self._SCCs = []
-
- index = 0
- stack = []
- for v in self.vertices:
- if v not in self.indices:
- self.strongconnect(v, index, stack)
-
- # =============================================================
- def strongconnect(self, v, index, stack):
- """Find strongly connected components."""
-
- self._indices[v] = index
- self._lowlinks[v] = index
- index += 1
- stack.append(v)
- self._onstack[v] = True
- for w in self.edges[v]:
- if w not in self.indices:
- self.strongconnect(w, index, stack)
- self._lowlinks[v] = min(self._lowlinks[v], self._lowlinks[w])
- elif self._onstack[w]:
- self._lowlinks[v] = min(self._lowlinks[v], self._indices[w])
-
- if self._lowlinks[v] == self._indices[v]:
- self._SCCs.append(set())
- while stack[-1] != v:
- w = stack.pop()
- self._onstack[w] = False
- self._SCCs[-1].add(w)
- w = stack.pop()
- self._onstack[w] = False
- self._SCCs[-1].add(w)
-
- # ======================
- @property
- def edges(self):
- return self._edges
-
- @property
- def vertices(self):
- return self._vertices
-
- @property
- def indices(self):
- return self._indices
-
- @property
- def SCCs(self):
- return self._SCCs
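An illustrative check of the cycle detection on a tiny hand-made "parse" (the arrays are made up; non-singleton SCCs are the cycles a decoder would need to break):

```python
import numpy as np

prediction = np.array([0, 2, 1, 2])  # prediction[dep] = head; slot 0 is the root
tokens = np.arange(1, 4)             # the real tokens, excluding the root
t = Tarjan(prediction, tokens)
print(t.SCCs)                        # non-singleton sets are cycles, here {1, 2}
```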
diff --git a/scripts/parsing/common/utils.py b/scripts/parsing/common/utils.py
deleted file mode 100644
index 06d7aebdba..0000000000
--- a/scripts/parsing/common/utils.py
+++ /dev/null
@@ -1,526 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Utility classes."""
-
-import logging
-import math
-import os
-import sys
-import time
-
-import numpy as np
-import mxnet as mx
-from mxnet import nd
-from mxnet.gluon import rnn, contrib
-
-from .data import ParserVocabulary
-from .tarjan import Tarjan
-
-
-class Progbar:
- """Progbar class copied from keras (https://github.com/fchollet/keras/)
-
- Displays a progress bar.
- Small edit: added a `strict` arg to `update`
-
- Parameters
- ----------
- target : int
- Total number of steps expected.
- width : int
- Progress bar width.
- verbose : int
- Verbosity level. Options are 1 and 2.
- """
- def __init__(self, target, width=30, verbose=1):
- self.width = width
- self.target = target
- self.sum_values = {}
- self.unique_values = []
- self.start = time.time()
- self.total_width = 0
- self.seen_so_far = 0
- self.verbose = verbose
-
- def update(self, current, values=None, exact=None, strict=None):
- """
- Updates the progress bar.
-
- Parameters
- ----------
- current : int
- Index of current step.
- values : List of tuples (name, value_for_last_step).
- The progress bar will display averages for these values.
- exact : List of tuples (name, value_for_last_step).
- The progress bar will display these values directly.
- strict : List of tuples (name, value).
- The progress bar will display these values as-is, without averaging.
- """
- values = values or []
- exact = exact or []
- strict = strict or []
-
- for k, v in values:
- if k not in self.sum_values:
- self.sum_values[k] = [v * (current - self.seen_so_far), current - self.seen_so_far]
- self.unique_values.append(k)
- else:
- self.sum_values[k][0] += v * (current - self.seen_so_far)
- self.sum_values[k][1] += (current - self.seen_so_far)
-
- for cells in exact:
- k, v, w = cells[0], cells[1], 4
- if len(cells) == 3:
- w = cells[2]
- if k not in self.sum_values:
- self.unique_values.append(k)
- self.sum_values[k] = [v, 1, w]
-
- for k, v in strict:
- if k not in self.sum_values:
- self.unique_values.append(k)
- self.sum_values[k] = v
-
- self.seen_so_far = current
-
- now = time.time()
- if self.verbose == 1:
- prev_total_width = self.total_width
- sys.stdout.write('\b' * prev_total_width)
- sys.stdout.write('\r')
-
- numdigits = 0 if self.target == 0 or math.isnan(self.target) \
- else int(np.floor(np.log10(self.target))) + 1
- barstr = '%%%dd/%%%dd [' % (numdigits, numdigits)
- bar = barstr % (current, self.target)
- prog = 0 if self.target == 0 else float(current) / self.target
- prog_width = int(self.width * prog)
- if prog_width > 0:
- bar += ('=' * (prog_width - 1))
- if current < self.target:
- bar += '>'
- else:
- bar += '='
- bar += ('.' * (self.width - prog_width))
- bar += ']'
- sys.stdout.write(bar)
- self.total_width = len(bar)
-
- if current:
- time_per_unit = (now - self.start) / current
- else:
- time_per_unit = 0
- eta = time_per_unit * (self.target - current)
- info = ''
- if current < self.target:
- info += ' - ETA: %ds' % eta
- else:
- info += ' - %ds' % (now - self.start)
- for k in self.unique_values:
- if isinstance(self.sum_values[k], list):
- info += (' - %s: %.' + str(self.sum_values[k][2]) + 'f') % (
- k, self.sum_values[k][0] / max(1, self.sum_values[k][1]))
- else:
- info += ' - %s: %s' % (k, self.sum_values[k])
-
- self.total_width += len(info)
- if prev_total_width > self.total_width:
- info += ((prev_total_width - self.total_width) * ' ')
-
- sys.stdout.write(info)
- sys.stdout.flush()
-
- if current >= self.target:
- sys.stdout.write('\n')
-
- if self.verbose == 2:
- if current >= self.target:
- info = '%ds' % (now - self.start)
- for k in self.unique_values:
- info += ' - %s: %.4f' % (k,
- self.sum_values[k][0] / max(1, self.sum_values[k][1]))
- sys.stdout.write(info + '\n')
-
- def add(self, n, values=None):
- values = values or []
- self.update(self.seen_so_far + n, values)
-
-
-def mxnet_prefer_gpu():
- """If gpu available return gpu, else cpu
-
- Returns
- -------
- context : Context
- The preferable GPU context.
- """
- gpu = int(os.environ.get('MXNET_GPU', default=0))
- if gpu in mx.test_utils.list_gpus():
- return mx.gpu(gpu)
- return mx.cpu()
-
-
-def init_logger(root_dir, name='train.log'):
- """Initialize a logger
-
- Parameters
- ----------
- root_dir : str
- directory for saving log
- name : str
- name of logger
-
- Returns
- -------
- logger : logging.Logger
- a logger
- """
- os.makedirs(root_dir, exist_ok=True)
- log_formatter = logging.Formatter('%(message)s')
- logger = logging.getLogger(name)
- file_handler = logging.FileHandler('{0}/{1}'.format(root_dir, name), mode='w')
- file_handler.setFormatter(log_formatter)
- logger.addHandler(file_handler)
- console_handler = logging.StreamHandler()
- console_handler.setFormatter(log_formatter)
- logger.addHandler(console_handler)
- logger.setLevel(logging.INFO)
- return logger
-
-
-def orthonormal_VanillaLSTMBuilder(lstm_layers, input_dims, lstm_hiddens,
- dropout_h=0., debug=False):
- """Build a standard LSTM cell, with variational dropout,
- with weights initialized to be orthonormal (https://arxiv.org/abs/1312.6120)
-
- Parameters
- ----------
- lstm_layers : int
- Currently only a single layer is supported
- input_dims : int
- word vector dimensions
- lstm_hiddens : int
- hidden size
- dropout_h : float
- dropout on hidden states
- debug : bool
- set to True to skip orthonormal initialization
-
- Returns
- -------
- lstm_cell : VariationalDropoutCell
- An LSTM cell wrapped in variational dropout
- """
- assert lstm_layers == 1, 'only accept one layer lstm'
- W = orthonormal_initializer(lstm_hiddens, lstm_hiddens + input_dims, debug)
- W_h, W_x = W[:, :lstm_hiddens], W[:, lstm_hiddens:]
- b = nd.zeros((4 * lstm_hiddens,))
- b[lstm_hiddens:2 * lstm_hiddens] = -1.0
- lstm_cell = rnn.LSTMCell(input_size=input_dims, hidden_size=lstm_hiddens,
- i2h_weight_initializer=mx.init.Constant(np.concatenate([W_x] * 4, 0)),
- h2h_weight_initializer=mx.init.Constant(np.concatenate([W_h] * 4, 0)),
- h2h_bias_initializer=mx.init.Constant(b))
- wrapper = contrib.rnn.VariationalDropoutCell(lstm_cell, drop_states=dropout_h)
- return wrapper
-
-
-def biLSTM(f_lstm, b_lstm, inputs, dropout_x=0.):
- """Feature extraction through BiLSTM
-
- Parameters
- ----------
- f_lstm : VariationalDropoutCell
- Forward cell
- b_lstm : VariationalDropoutCell
- Backward cell
- inputs : NDArray
- seq_len x batch_size
- dropout_x : float
- Variational dropout on inputs
-
- Returns
- -------
- outputs : NDArray
- Outputs of BiLSTM layers, seq_len x 2 hidden_dims x batch_size
- """
- for f, b in zip(f_lstm, b_lstm):
- inputs = nd.Dropout(inputs, dropout_x, axes=[0]) # important for variational dropout
- fo, _ = f.unroll(length=inputs.shape[0], inputs=inputs, layout='TNC', merge_outputs=True)
- bo, _ = b.unroll(length=inputs.shape[0], inputs=inputs.flip(axis=0), layout='TNC',
- merge_outputs=True)
- f.reset()
- b.reset()
- inputs = nd.concat(fo, bo.flip(axis=0), dim=2)
- return inputs
-
-
-def leaky_relu(x):
- """slope=0.1 leaky ReLu
-
- Parameters
- ----------
- x : NDArray
- Input
-
- Returns
- -------
- y : NDArray
- y = x > 0 ? x : 0.1 * x
- """
- return nd.LeakyReLU(x, slope=.1)
-
-
-def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
- """Do xWy
-
- Parameters
- ----------
- x : NDArray
- (input_size x seq_len) x batch_size
- W : NDArray
- (num_outputs x ny) x nx
- y : NDArray
- (input_size x seq_len) x batch_size
- input_size : int
- input dimension
- seq_len : int
- sequence length
- batch_size : int
- batch size
- num_outputs : int
- number of outputs
- bias_x : bool
- whether concat bias vector to input x
- bias_y : bool
- whether concat bias vector to input y
-
- Returns
- -------
- output : NDArray
- [seq_len_y x seq_len_x if num_outputs == 1 else seq_len_y x num_outputs x seq_len_x]
- x batch_size
- """
- if bias_x:
- x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
- if bias_y:
- y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)
-
- ny = input_size + bias_y
- # W: (num_outputs x ny) x nx
- lin = nd.dot(W, x)
- if num_outputs > 1:
- lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
- y = y.transpose([2, 1, 0]) # May cause performance issues
- lin = lin.transpose([2, 1, 0])
- blin = nd.batch_dot(lin, y, transpose_b=True)
- blin = blin.transpose([2, 1, 0])
- if num_outputs > 1:
- blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
- return blin
-
-
-def orthonormal_initializer(output_size, input_size, debug=False):
- """adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/linalg.py
-
- Parameters
- ----------
- output_size : int
- input_size : int
- debug : bool
- Whether to skip the orthonormal initialization and return a plain random matrix
-
- Returns
- -------
- Q : np.ndarray
- The orthonormal weight matrix of input_size x output_size
- """
- print((output_size, input_size))
- if debug:
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- return np.transpose(Q.astype(np.float32))
- I = np.eye(output_size)
- lr = .1
- eps = .05 / (output_size + input_size)
- success = False
- tries = 0
- while not success and tries < 10:
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- for _ in range(100):
- QTQmI = Q.T.dot(Q) - I
- loss = np.sum(QTQmI ** 2 / 2)
- Q2 = Q ** 2
- Q -= lr * Q.dot(QTQmI) / (
- np.abs(Q2 + Q2.sum(axis=0, keepdims=True)
- + Q2.sum(axis=1, keepdims=True) - 1) + eps)
- if np.max(Q) > 1e6 or loss > 1e6 or not np.isfinite(loss):
- tries += 1
- lr /= 2
- break
- success = True
- if success:
- print(('Orthogonal pretrainer loss: %.2e' % loss))
- else:
- print('Orthogonal pretrainer failed, using non-orthogonal random matrix')
- Q = np.random.randn(input_size, output_size) / np.sqrt(output_size)
- return np.transpose(Q.astype(np.float32))
-
-
-def arc_argmax(parse_probs, length, tokens_to_keep, ensure_tree=True):
- """MST
- Adopted from Timothy Dozat https://github.com/tdozat/Parser/blob/master/lib/models/nn.py
-
- Parameters
- ----------
- parse_probs : NDArray
- seq_len x seq_len, the probability of arcs
- length : int
- real sentence length
- tokens_to_keep : NDArray
- mask matrix
- ensure_tree : bool
- whether to ensure a tree structure in the output (apply MST)
-
- Returns
- -------
- parse_preds : np.ndarray
- prediction of arc parsing with size of (seq_len,)
- """
- if ensure_tree:
- I = np.eye(len(tokens_to_keep))
- # block loops and pad heads
- parse_probs = parse_probs * tokens_to_keep * (1 - I)
- parse_preds = np.argmax(parse_probs, axis=1)
- tokens = np.arange(1, length)
- roots = np.where(parse_preds[tokens] == 0)[0] + 1
- # ensure at least one root
- if len(roots) < 1:
- # The current root probabilities
- root_probs = parse_probs[tokens, 0]
- # The current head probabilities
- old_head_probs = parse_probs[tokens, parse_preds[tokens]]
- # Get new potential root probabilities
- new_root_probs = root_probs / old_head_probs
- # Select the most probable root
- new_root = tokens[np.argmax(new_root_probs)]
- # Make the change
- parse_preds[new_root] = 0
- # ensure at most one root
- elif len(roots) > 1:
- # The probabilities of the current heads
- root_probs = parse_probs[roots, 0]
- # Set the probability of depending on the root zero
- parse_probs[roots, 0] = 0
- # Get new potential heads and their probabilities
- new_heads = np.argmax(parse_probs[roots][:, tokens], axis=1) + 1
- new_head_probs = parse_probs[roots, new_heads] / root_probs
- # Select the most probable root
- new_root = roots[np.argmin(new_head_probs)]
- # Make the change
- parse_preds[roots] = new_heads
- parse_preds[new_root] = 0
- # remove cycles
- tarjan = Tarjan(parse_preds, tokens)
- for SCC in tarjan.SCCs:
- if len(SCC) > 1:
- dependents = set()
- to_visit = set(SCC)
- while len(to_visit) > 0:
- node = to_visit.pop()
- if not node in dependents:
- dependents.add(node)
- to_visit.update(tarjan.edges[node])
- # The indices of the nodes that participate in the cycle
- cycle = np.array(list(SCC))
- # The probabilities of the current heads
- old_heads = parse_preds[cycle]
- old_head_probs = parse_probs[cycle, old_heads]
- # Set the probability of depending on a non-head to zero
- non_heads = np.array(list(dependents))
- parse_probs[np.repeat(cycle, len(non_heads)),
- np.repeat([non_heads], len(cycle), axis=0).flatten()] = 0
- # Get new potential heads and their probabilities
- new_heads = np.argmax(parse_probs[cycle][:, tokens], axis=1) + 1
- new_head_probs = parse_probs[cycle, new_heads] / old_head_probs
- # Select the most probable change
- change = np.argmax(new_head_probs)
- changed_cycle = cycle[change]
- old_head = old_heads[change]
- new_head = new_heads[change]
- # Make the change
- parse_preds[changed_cycle] = new_head
- tarjan.edges[new_head].add(changed_cycle)
- tarjan.edges[old_head].remove(changed_cycle)
- return parse_preds
- else:
- # block and pad heads
- parse_probs = parse_probs * tokens_to_keep
- parse_preds = np.argmax(parse_probs, axis=1)
- return parse_preds
-
-
-def rel_argmax(rel_probs, length, ensure_tree=True):
- """Fix the relation prediction by heuristic rules
-
- Parameters
- ----------
- rel_probs : NDArray
- seq_len x rel_size
- length : int
- real sentence length
- ensure_tree : bool
- whether to apply the heuristic rules
-
- Returns
- -------
- rel_preds : np.ndarray
- prediction of relations of size (seq_len,)
- """
- if ensure_tree:
- rel_probs[:, ParserVocabulary.PAD] = 0
- root = ParserVocabulary.ROOT
- tokens = np.arange(1, length)
- rel_preds = np.argmax(rel_probs, axis=1)
- roots = np.where(rel_preds[tokens] == root)[0] + 1
- if len(roots) < 1:
- rel_preds[1 + np.argmax(rel_probs[tokens, root])] = root
- elif len(roots) > 1:
- root_probs = rel_probs[roots, root]
- rel_probs[roots, root] = 0
- new_rel_preds = np.argmax(rel_probs[roots], axis=1)
- new_rel_probs = rel_probs[roots, new_rel_preds] / root_probs
- new_root = roots[np.argmin(new_rel_probs)]
- rel_preds[roots] = new_rel_preds
- rel_preds[new_root] = root
- return rel_preds
- else:
- rel_probs[:, ParserVocabulary.PAD] = 0
- rel_preds = np.argmax(rel_probs, axis=1)
- return rel_preds
-
-
-def reshape_fortran(tensor, shape):
- """The missing Fortran reshape for mx.NDArray
-
- Parameters
- ----------
- tensor : NDArray
- source tensor
- shape : tuple
- desired shape
-
- Returns
- -------
- output : NDArray
- reordered result
- """
- return tensor.T.reshape(tuple(reversed(shape))).T
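A quick sanity check that `reshape_fortran` above matches NumPy's column-major reshape (a sketch assuming MXNet is installed):

```python
import numpy as np
from mxnet import nd

x = np.arange(6).reshape(2, 3)
print(reshape_fortran(nd.array(x), (3, 2)).asnumpy())
print(x.reshape((3, 2), order='F'))  # same values, column-major order
```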
diff --git a/scripts/parsing/index.rst b/scripts/parsing/index.rst
deleted file mode 100644
index 878f9d9d2b..0000000000
--- a/scripts/parsing/index.rst
+++ /dev/null
@@ -1,79 +0,0 @@
-Dependency Parsing
----------------------------------
-
-:download:`Download scripts `
-
-Deep Biaffine Dependency Parser
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-This package contains an implementation of `Deep Biaffine Attention for Neural Dependency Parsing `_ proposed by Dozat and Manning (2016), with SOTA accuracy.
-
-Train
-""""""""""
-
-As the Penn Treebank dataset (PTB) is proprietary, we are unable to distribute it.
-If you have a legal copy, please place it in ``tests/data/biaffine/ptb`` and use this `pre-processing script `_ to convert it into the conllx format.
-The tree view of data folder should be as follows.
-
-.. code-block:: console
-
- $ tree tests/data/biaffine
- tests/data/biaffine
- └── ptb
- ├── dev.conllx
- ├── test.conllx
- └── train.conllx
-
-Then run the following code to train the biaffine model.
-
-.. code-block:: python
-
- parser = DepParser()
- parser.train(train_file='tests/data/biaffine/ptb/train.conllx',
- dev_file='tests/data/biaffine/ptb/dev.conllx',
- test_file='tests/data/biaffine/ptb/test.conllx', save_dir='tests/data/biaffine/model',
- pretrained_embeddings=('glove', 'glove.6B.100d'))
- parser.evaluate(test_file='tests/data/biaffine/ptb/test.conllx', save_dir='tests/data/biaffine/model')
-
-
-The expected UAS should be around ``96%`` (see `training log `_ and `evaluation log `_). The trained model will be saved in the following folder.
-
-.. code-block:: console
-
- $ tree tests/data/biaffine/model
- tests/data/biaffine/model
- ├── config.pkl
- ├── model.bin
- ├── test.log
- ├── train.log
- └── vocab.pkl
-
-Note that the embeddings are not kept in ``model.bin``, in order to reduce file size.
-Users need to keep the embeddings in the same place after training.
-A good practice is to place the embeddings in the model folder and distribute them together.
-
-Decode
-""""""""""
-
-Once we have trained a model or downloaded a pre-trained one, we can load it and decode raw sentences.
-
-.. code-block:: python
-
- parser = DepParser()
- parser.load('tests/data/biaffine/model')
- sentence = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'), ('of', 'IN'), ('chamber', 'NN'),
- ('music', 'NN'), ('?', '.')]
- print(parser.parse(sentence))
-
-
-The output should be as follows.
-
-.. code-block:: text
-
- 1 Is _ _ VBZ _ 4 cop _ _
- 2 this _ _ DT _ 4 nsubj _ _
- 3 the _ _ DT _ 4 det _ _
- 4 future _ _ NN _ 0 root _ _
- 5 of _ _ IN _ 4 prep _ _
- 6 chamber _ _ NN _ 7 nn _ _
- 7 music _ _ NN _ 5 pobj _ _
- 8 ? _ _ . _ 4 punct _ _
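For readers unfamiliar with the metric quoted above: UAS is simply the fraction of tokens whose predicted head matches the gold head. A rough sketch of that computation over two line-aligned conllx files (hypothetical paths; it ignores the punctuation-exclusion conventions of the official evaluation script):

```python
def uas(gold_path, pred_path):
    correct = total = 0
    with open(gold_path) as gold, open(pred_path) as pred:
        for gline, pline in zip(gold, pred):
            gcols = gline.rstrip('\n').split('\t')
            pcols = pline.rstrip('\n').split('\t')
            if len(gcols) < 8:          # blank line between sentences
                continue
            total += 1
            correct += gcols[6] == pcols[6]   # column 7 (1-based) is HEAD
    return correct / total
```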
diff --git a/scripts/parsing/parser/__init__.py b/scripts/parsing/parser/__init__.py
deleted file mode 100644
index 13a83393a9..0000000000
--- a/scripts/parsing/parser/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
diff --git a/scripts/parsing/parser/biaffine_parser.py b/scripts/parsing/parser/biaffine_parser.py
deleted file mode 100644
index 5c9dfdd5ed..0000000000
--- a/scripts/parsing/parser/biaffine_parser.py
+++ /dev/null
@@ -1,357 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Deep Biaffine Parser Model."""
-
-import numpy as np
-import mxnet as mx
-from mxnet import nd, ndarray, autograd
-from mxnet.gluon import nn, loss
-
-from scripts.parsing.common import utils
-from gluonnlp.model import apply_weight_drop
-
-
-class BiaffineParser(nn.Block):
- """A MXNet replicate of biaffine parser, see following paper
- Dozat, T., & Manning, C. D. (2016). Deep biaffine attention for neural dependency parsing.
- arXiv:1611.01734.
-
- It's a re-implementation of DyNet version
- https://github.com/jcyk/Dynet-Biaffine-dependency-parser
-
- Parameters
- ----------
- vocab : ParserVocabulary
- built from a data set
- word_dims : int
- word vector dimension
- tag_dims : int
- tag vector dimension
- dropout_dim : float
- rate of word dropout (dropping out the entire embedding of a word)
- lstm_layers : int
- number of lstm layers
- lstm_hiddens : int
- size of lstm hidden states
- dropout_lstm_input : float
- dropout on x in variational RNN
- dropout_lstm_hidden : float
- dropout on h in variational RNN
- mlp_arc_size : int
- output size of MLP for arc feature extraction
- mlp_rel_size : int
- output size of MLP for rel feature extraction
- dropout_mlp : float
- dropout on the output of LSTM
- debug : bool
- debug mode
- """
- def __init__(self, vocab,
- word_dims,
- tag_dims,
- dropout_dim,
- lstm_layers,
- lstm_hiddens,
- dropout_lstm_input,
- dropout_lstm_hidden,
- mlp_arc_size,
- mlp_rel_size,
- dropout_mlp,
- debug=False):
- super(BiaffineParser, self).__init__()
-
- def embedding_from_numpy(_we, trainable=True):
- word_embs = nn.Embedding(_we.shape[0], _we.shape[1],
- weight_initializer=mx.init.Constant(_we))
- apply_weight_drop(word_embs, 'weight', dropout_dim, axes=(1,))
- if not trainable:
- word_embs.collect_params().setattr('grad_req', 'null')
- return word_embs
-
- self._vocab = vocab
- self.word_embs = embedding_from_numpy(vocab.get_word_embs(word_dims))
- self.pret_word_embs = embedding_from_numpy(vocab.get_pret_embs(),
- trainable=False) if vocab.has_pret_embs() \
- else None
- self.tag_embs = embedding_from_numpy(vocab.get_tag_embs(tag_dims))
-
- self.f_lstm = nn.Sequential()
- self.b_lstm = nn.Sequential()
- self.f_lstm.add(utils.orthonormal_VanillaLSTMBuilder(1, word_dims + tag_dims,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.b_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, word_dims + tag_dims,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- for _ in range(lstm_layers - 1):
- self.f_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, 2 * lstm_hiddens,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.b_lstm.add(
- utils.orthonormal_VanillaLSTMBuilder(1, 2 * lstm_hiddens,
- lstm_hiddens,
- dropout_lstm_hidden, debug))
- self.dropout_lstm_input = dropout_lstm_input
- self.dropout_lstm_hidden = dropout_lstm_hidden
-
- mlp_size = mlp_arc_size + mlp_rel_size
- W = utils.orthonormal_initializer(mlp_size, 2 * lstm_hiddens, debug)
- self.mlp_dep_W = self.parameter_from_numpy('mlp_dep_W', W)
- self.mlp_head_W = self.parameter_from_numpy('mlp_head_W', W)
- self.mlp_dep_b = self.parameter_init('mlp_dep_b', (mlp_size,), mx.init.Zero())
- self.mlp_head_b = self.parameter_init('mlp_head_b', (mlp_size,), mx.init.Zero())
- self.mlp_arc_size = mlp_arc_size
- self.mlp_rel_size = mlp_rel_size
- self.dropout_mlp = dropout_mlp
-
- self.arc_W = self.parameter_init('arc_W', (mlp_arc_size, mlp_arc_size + 1),
- init=mx.init.Zero())
- self.rel_W = self.parameter_init('rel_W', (vocab.rel_size * (mlp_rel_size + 1),
- mlp_rel_size + 1),
- init=mx.init.Zero())
- self.softmax_loss = loss.SoftmaxCrossEntropyLoss(axis=0, batch_axis=-1)
-
- self.initialize()
-
- def parameter_from_numpy(self, name, array):
- """ Create parameter with its value initialized according to a numpy tensor
-
- Parameters
- ----------
- name : str
- parameter name
- array : np.ndarray
- initial value
-
- Returns
- -------
- mxnet.gluon.parameter
- a parameter object
- """
- p = self.params.get(name, shape=array.shape, init=mx.init.Constant(array))
- return p
-
- def parameter_init(self, name, shape, init):
- """Create parameter given name, shape and initiator
-
- Parameters
- ----------
- name : str
- parameter name
- shape : tuple
- parameter shape
- init : mxnet.initializer
- an initializer
-
- Returns
- -------
- mxnet.gluon.parameter
- a parameter object
- """
- p = self.params.get(name, shape=shape, init=init)
- return p
-
- def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
- # pylint: disable=arguments-differ
- """Run decoding
-
- Parameters
- ----------
- word_inputs : mxnet.ndarray.NDArray
- word indices of seq_len x batch_size
- tag_inputs : mxnet.ndarray.NDArray
- tag indices of seq_len x batch_size
- arc_targets : mxnet.ndarray.NDArray
- gold arc indices of seq_len x batch_size
- rel_targets : mxnet.ndarray.NDArray
- gold rel indices of seq_len x batch_size
-
- Returns
- -------
- tuple
- (arc_accuracy, rel_accuracy, overall_accuracy, loss) when training;
- (arc_accuracy, rel_accuracy, overall_accuracy, outputs) when gold targets are given at test time;
- otherwise outputs, where outputs is a list of (arcs, rels) pairs, one per sentence.
- """
- def flatten_numpy(arr):
- """Flatten nd-array to 1-d column vector
-
- Parameters
- ----------
- arr : numpy.ndarray
- input tensor
-
- Returns
- -------
- numpy.ndarray
- A column vector
-
- """
- return np.reshape(arr, (-1,), 'F')
-
- is_train = autograd.is_training()
- batch_size = word_inputs.shape[1]
- seq_len = word_inputs.shape[0]
- mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
- num_tokens = int(np.sum(mask)) # non padding, non root token number
-
- if is_train or arc_targets is not None:
- mask_1D = flatten_numpy(mask)
- mask_1D_tensor = nd.array(mask_1D)
-
- unked_words = np.where(word_inputs < self._vocab.words_in_train,
- word_inputs, self._vocab.UNK)
- word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
- if self.pret_word_embs:
- word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
- tag_embs = self.tag_embs(nd.array(tag_inputs))
-
- # Dropout
- emb_inputs = nd.concat(word_embs, tag_embs, dim=2) # seq_len x batch_size
-
- top_recur = utils.biLSTM(self.f_lstm, self.b_lstm, emb_inputs,
- dropout_x=self.dropout_lstm_input)
- top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)
-
- W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
- W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
- dep = nd.Dropout(data=utils.leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep),
- axes=[0], p=self.dropout_mlp)
- head = nd.Dropout(data=utils.leaky_relu(nd.dot(top_recur, W_head.T) + b_head),
- axes=[0], p=self.dropout_mlp)
- dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
- dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
- head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
-
- W_arc = self.arc_W.data()
- arc_logits = utils.bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
- seq_len, batch_size, num_outputs=1, bias_x=True, bias_y=False)
- # (#head x #dep) x batch_size
-
- flat_arc_logits = utils.reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
- # (#head ) x (#dep x batch_size)
-
- arc_preds = arc_logits.argmax(0)
- # seq_len x batch_size
-
- if is_train or arc_targets is not None:
- correct = np.equal(arc_preds.asnumpy(), arc_targets)
- arc_correct = correct.astype(np.float32) * mask
- arc_accuracy = np.sum(arc_correct) / num_tokens
- targets_1D = flatten_numpy(arc_targets)
- losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
- arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens
-
- if not is_train:
- arc_probs = np.transpose(
- np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(),
- (seq_len, seq_len, batch_size), 'F'))
- # #batch_size x #dep x #head
-
- W_rel = self.rel_W.data()
- rel_logits = utils.bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
- seq_len, batch_size, num_outputs=self._vocab.rel_size,
- bias_x=True, bias_y=True)
- # (#head x rel_size x #dep) x batch_size
-
- flat_rel_logits = utils.reshape_fortran(rel_logits, (seq_len, self._vocab.rel_size,
- seq_len * batch_size))
- # (#head x rel_size) x (#dep x batch_size)
-
- if is_train: # pylint: disable=using-constant-test
- _target_vec = targets_1D
- else:
- _target_vec = flatten_numpy(arc_preds.asnumpy())
- _target_vec = nd.array(_target_vec).reshape(seq_len * batch_size, 1)
- _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))
-
- partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
- # (rel_size) x (#dep x batch_size)
-
- if is_train or arc_targets is not None:
- rel_preds = partial_rel_logits.argmax(0)
- targets_1D = flatten_numpy(rel_targets)
- rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
- rel_accuracy = np.sum(rel_correct) / num_tokens
- losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
- rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens
-
- if not is_train:
- rel_probs = np.transpose(np.reshape(nd.softmax(flat_rel_logits.transpose([1, 0, 2]),
- axis=0).asnumpy(),
- (self._vocab.rel_size, seq_len,
- seq_len, batch_size), 'F'))
- # batch_size x #dep x #head x #nclasses
-
- if is_train or arc_targets is not None:
- l = arc_loss + rel_loss
- correct = rel_correct * flatten_numpy(arc_correct)
- overall_accuracy = np.sum(correct) / num_tokens
-
- if is_train: # pylint: disable=using-constant-test
- return arc_accuracy, rel_accuracy, overall_accuracy, l
-
- outputs = []
-
- for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
- # parse sentences one by one
- msk[0] = 1.
- sent_len = int(np.sum(msk))
- arc_pred = utils.arc_argmax(arc_prob, sent_len, msk)
- rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
- rel_pred = utils.rel_argmax(rel_prob, sent_len)
- outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))
-
- if arc_targets is not None:
- return arc_accuracy, rel_accuracy, overall_accuracy, outputs
- return outputs
-
- def save_parameters(self, filename): # pylint: disable=arguments-differ
- """Save model
-
- Parameters
- ----------
- filename : str
- path to model file
- """
- params = self._collect_params_with_prefix()
- if self.pret_word_embs: # don't save word embeddings inside model
- params.pop('pret_word_embs.weight', None)
- arg_dict = {key: val._reduce() for key, val in params.items()}
- ndarray.save(filename, arg_dict)
-
- def save(self, save_path):
- """Save model
-
- Parameters
- ----------
- save_path : str
- path to model file
- """
- self.save_parameters(save_path)
-
- def load(self, load_path):
- """Load model
-
- Parameters
- ----------
- load_path : str
- path to model file
- """
- self.load_parameters(load_path, allow_missing=True)
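The heart of the model above is the biaffine scorer: the arc score for each (head, dependent) pair is a bilinear product of the two MLP outputs, with a bias term on the dependent side. A plain-NumPy sketch of that idea (shapes only, with made-up sizes; not the model's exact, batched code path):

```python
import numpy as np

seq_len, d = 5, 4                      # toy sequence length and feature size
dep = np.random.randn(seq_len, d)      # dep_arc features, one row per token
head = np.random.randn(seq_len, d)     # head_arc features
W = np.random.randn(d, d + 1)          # extra column for the bias on the dep side
dep_b = np.concatenate([dep, np.ones((seq_len, 1))], axis=1)
scores = head @ W @ dep_b.T            # seq_len x seq_len arc scores
print(scores.shape)                    # (5, 5): score of head i for dependent j
```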
diff --git a/scripts/parsing/parser/dep_parser.py b/scripts/parsing/parser/dep_parser.py
deleted file mode 100644
index f9f1b9fa9b..0000000000
--- a/scripts/parsing/parser/dep_parser.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Deep Biaffine Dependency Parser driver class and script."""
-
-import math
-import os
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-
-from scripts.parsing.common.config import _Config
-from scripts.parsing.common.data import ParserVocabulary, DataLoader, ConllWord, ConllSentence
-from scripts.parsing.common.exponential_scheduler import ExponentialScheduler
-from scripts.parsing.common.utils import init_logger, mxnet_prefer_gpu, Progbar
-from scripts.parsing.parser.biaffine_parser import BiaffineParser
-from scripts.parsing.parser.evaluate import evaluate_official_script
-
-
-class DepParser:
- """User interfaces for biaffine dependency parser.
-
- It wraps a biaffine model inside, provides training, evaluating and parsing.
- """
-
- def __init__(self):
- super().__init__()
- self._parser = None
- self._vocab = None
-
- def train(self, train_file, dev_file, test_file, save_dir,
- pretrained_embeddings=None, min_occur_count=2,
- lstm_layers=3, word_dims=100, tag_dims=100, dropout_emb=0.33, lstm_hiddens=400,
- dropout_lstm_input=0.33, dropout_lstm_hidden=0.33,
- mlp_arc_size=500, mlp_rel_size=100,
- dropout_mlp=0.33, learning_rate=2e-3, decay=.75, decay_steps=5000,
- beta_1=.9, beta_2=.9, epsilon=1e-12,
- num_buckets_train=40,
- num_buckets_valid=10, num_buckets_test=10, train_iters=50000, train_batch_size=5000,
- test_batch_size=5000, validate_every=100, save_after=5000, debug=False):
- """Train a deep biaffine dependency parser.
-
- Parameters
- ----------
- train_file : str
- path to training set
- dev_file : str
- path to dev set
- test_file : str
- path to test set
- save_dir : str
- a directory for saving model and related meta-data
- pretrained_embeddings : tuple
- (embedding_name, source), used for gluonnlp.embedding.create(embedding_name, source)
- min_occur_count : int
- threshold of rare words, which will be replaced with UNKs,
- lstm_layers : int
- layers of lstm
- word_dims : int
- dimension of word embedding
- tag_dims : int
- dimension of tag embedding
- dropout_emb : float
- word dropout
- lstm_hiddens : int
- size of lstm hidden states
- dropout_lstm_input : int
- dropout on x in variational RNN
- dropout_lstm_hidden : int
- dropout on h in variational RNN
- mlp_arc_size : int
- output size of MLP for arc feature extraction
- mlp_rel_size : int
- output size of MLP for rel feature extraction
- dropout_mlp : float
- dropout on the output of LSTM
- learning_rate : float
- learning rate
- decay : float
- see ExponentialScheduler
- decay_steps : int
- see ExponentialScheduler
- beta_1 : float
- see ExponentialScheduler
- beta_2 : float
- see ExponentialScheduler
- epsilon : float
- see ExponentialScheduler
- num_buckets_train : int
- number of buckets for training data set
- num_buckets_valid : int
- number of buckets for dev data set
- num_buckets_test : int
- number of buckets for testing data set
- train_iters : int
- training iterations
- train_batch_size : int
- training batch size
- test_batch_size : int
- test batch size
- validate_every : int
- validate on dev set every such number of batches
- save_after : int
- skip saving model in early epochs
- debug : bool
- debug mode
-
- Returns
- -------
- DepParser
- parser itself
- """
- logger = init_logger(save_dir)
- config = _Config(train_file, dev_file, test_file, save_dir, pretrained_embeddings,
- min_occur_count,
- lstm_layers, word_dims, tag_dims, dropout_emb, lstm_hiddens,
- dropout_lstm_input, dropout_lstm_hidden, mlp_arc_size, mlp_rel_size,
- dropout_mlp, learning_rate, decay, decay_steps,
- beta_1, beta_2, epsilon, num_buckets_train, num_buckets_valid,
- num_buckets_test, train_iters,
- train_batch_size, debug)
- config.save()
- self._vocab = vocab = ParserVocabulary(train_file,
- pretrained_embeddings,
- min_occur_count)
- vocab.save(config.save_vocab_path)
- vocab.log_info(logger)
-
- with mx.Context(mxnet_prefer_gpu()):
- self._parser = parser = BiaffineParser(vocab, word_dims, tag_dims,
- dropout_emb,
- lstm_layers,
- lstm_hiddens, dropout_lstm_input,
- dropout_lstm_hidden,
- mlp_arc_size,
- mlp_rel_size, dropout_mlp, debug)
- parser.initialize()
- scheduler = ExponentialScheduler(learning_rate, decay, decay_steps)
- optimizer = mx.optimizer.Adam(learning_rate, beta_1, beta_2, epsilon,
- lr_scheduler=scheduler)
- trainer = gluon.Trainer(parser.collect_params(), optimizer=optimizer)
- data_loader = DataLoader(train_file, num_buckets_train, vocab)
- global_step = 0
- best_UAS = 0.
- batch_id = 0
- epoch = 1
- total_epoch = math.ceil(train_iters / validate_every)
- logger.info('Epoch %d out of %d', epoch, total_epoch)
- bar = Progbar(target=min(validate_every, data_loader.samples))
- while global_step < train_iters:
- for words, tags, arcs, rels in data_loader.get_batches(batch_size=train_batch_size,
- shuffle=True):
- with autograd.record():
- arc_accuracy, _, _, loss = parser.forward(words, tags, arcs, rels)
- loss_value = loss.asscalar()
- loss.backward()
- trainer.step(train_batch_size)
- batch_id += 1
- try:
- bar.update(batch_id,
- exact=[('UAS', arc_accuracy, 2),
- ('loss', loss_value)])
- except OverflowError:
- pass # sometimes loss can be 0 or infinity, crashes the bar
-
- global_step += 1
- if global_step % validate_every == 0:
- bar = Progbar(target=min(validate_every, train_iters - global_step))
- batch_id = 0
- UAS, LAS, speed = evaluate_official_script(parser, vocab,
- num_buckets_valid,
- test_batch_size,
- dev_file,
- os.path.join(save_dir,
- 'valid_tmp'))
- logger.info('Dev: UAS %.2f%% LAS %.2f%% %d sents/s', UAS, LAS, speed)
- epoch += 1
- if global_step < train_iters:
- logger.info('Epoch %d out of %d', epoch, total_epoch)
- if global_step > save_after and UAS > best_UAS:
- logger.info('- new best score!')
- best_UAS = UAS
- parser.save(config.save_model_path)
-
- # When validate_every is too big
- if not os.path.isfile(config.save_model_path) or best_UAS != UAS:
- parser.save(config.save_model_path)
-
- return self
-
- def load(self, path):
- """Load from disk
-
- Parameters
- ----------
- path : str
- path to the directory which typically contains a config.pkl file and a model.bin file
-
- Returns
- -------
- DepParser
- parser itself
- """
- config = _Config.load(os.path.join(path, 'config.pkl'))
- config.save_dir = path # redirect root path to what user specified
- self._vocab = vocab = ParserVocabulary.load(config.save_vocab_path)
- with mx.Context(mxnet_prefer_gpu()):
- self._parser = BiaffineParser(vocab, config.word_dims, config.tag_dims,
- config.dropout_emb,
- config.lstm_layers,
- config.lstm_hiddens, config.dropout_lstm_input,
- config.dropout_lstm_hidden,
- config.mlp_arc_size, config.mlp_rel_size,
- config.dropout_mlp, config.debug)
- self._parser.load(config.save_model_path)
- return self
-
- def evaluate(self, test_file, save_dir=None, logger=None,
- num_buckets_test=10, test_batch_size=5000):
- """Run evaluation on test set
-
- Parameters
- ----------
- test_file : str
- path to test set
- save_dir : str
- where to store intermediate results and log
- logger : logging.logger
- logger for printing results
- num_buckets_test : int
- number of clusters for sentences from test set
- test_batch_size : int
- batch size of test set
-
- Returns
- -------
- tuple
- UAS, LAS
- """
- parser = self._parser
- vocab = self._vocab
- with mx.Context(mxnet_prefer_gpu()):
- UAS, LAS, speed = evaluate_official_script(parser, vocab, num_buckets_test,
- test_batch_size, test_file,
- os.path.join(save_dir, 'valid_tmp'))
- if logger is None:
- logger = init_logger(save_dir, 'test.log')
- logger.info('Test: UAS %.2f%% LAS %.2f%% %d sents/s', UAS, LAS, speed)
-
- return UAS, LAS
-
- def parse(self, sentence):
- """Parse raw sentence into ConllSentence
-
- Parameters
- ----------
- sentence : list
- a list of (word, tag) tuples
-
- Returns
- -------
- ConllSentence
- ConllSentence object
- """
- words = np.zeros((len(sentence) + 1, 1), np.int32)
- tags = np.zeros((len(sentence) + 1, 1), np.int32)
- words[0, 0] = ParserVocabulary.ROOT
- tags[0, 0] = ParserVocabulary.ROOT
- vocab = self._vocab
-
- for i, (word, tag) in enumerate(sentence):
- words[i + 1, 0], tags[i + 1, 0] = vocab.word2id(word.lower()), vocab.tag2id(tag)
-
- with mx.Context(mxnet_prefer_gpu()):
- outputs = self._parser.forward(words, tags)
- words = []
- for arc, rel, (word, tag) in zip(outputs[0][0], outputs[0][1], sentence):
- words.append(ConllWord(idx=len(words) + 1, form=word, pos=tag,
- head=arc, relation=vocab.id2rel(rel)))
- return ConllSentence(words)
-
-
-if __name__ == '__main__':
- dep_parser = DepParser()
- dep_parser.train(train_file='tests/data/biaffine/ptb/train.conllx',
- dev_file='tests/data/biaffine/ptb/dev.conllx',
- test_file='tests/data/biaffine/ptb/test.conllx',
- save_dir='tests/data/biaffine/model',
- pretrained_embeddings=('glove', 'glove.6B.100d'))
- dep_parser.load('tests/data/biaffine/model')
- dep_parser.evaluate(test_file='tests/data/biaffine/ptb/test.conllx',
- save_dir='tests/data/biaffine/model')
-
- sent = [('Is', 'VBZ'), ('this', 'DT'), ('the', 'DT'), ('future', 'NN'),
- ('of', 'IN'), ('chamber', 'NN'), ('music', 'NN'), ('?', '.')]
- print(dep_parser.parse(sent))
diff --git a/scripts/parsing/parser/evaluate/__init__.py b/scripts/parsing/parser/evaluate/__init__.py
deleted file mode 100644
index ee3f45778c..0000000000
--- a/scripts/parsing/parser/evaluate/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Official evaluation for parsing."""
-
-from .evaluate import evaluate_official_script
diff --git a/scripts/parsing/parser/evaluate/evaluate.py b/scripts/parsing/parser/evaluate/evaluate.py
deleted file mode 100644
index 4622919d79..0000000000
--- a/scripts/parsing/parser/evaluate/evaluate.py
+++ /dev/null
@@ -1,136 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""Evaluation module for parsing results."""
-
-import time
-from functools import reduce
-import numpy as np
-import gluonnlp as nlp
-
-from scripts.parsing.common.data import DataLoader
-
-nlp.utils.check_version('0.7.0')
-
-def evaluate_official_script(parser, vocab, num_buckets_test, test_batch_size,
- test_file, output_file, debug=False):
- """Evaluate parser on a data set
-
- Parameters
- ----------
- parser : BiaffineParser
- biaffine parser
- vocab : ParserVocabulary
- vocabulary built from data set
- num_buckets_test : int
- size of buckets (cluster sentences into this number of clusters)
- test_batch_size : int
- batch size
- test_file : str
- gold test file
- output_file : str
- output result to this file
- debug : bool
- only evaluate first 1000 sentences for debugging
-
- Returns
- -------
- tuple
- UAS, LAS, speed
- """
- data_loader = DataLoader(test_file, num_buckets_test, vocab)
- record = data_loader.idx_sequence
- results = [None] * len(record)
- idx = 0
- seconds = time.time()
- uc, lc, total = 0, 0, 0
- for words, tags, arcs, rels in data_loader.get_batches(batch_size=test_batch_size,
- shuffle=False):
- outputs = parser.forward(words, tags)
- for output, gold_arc, gold_rel in zip(
- outputs, arcs.transpose([1, 0]), rels.transpose([1, 0])):
- pred_arc = output[0]
- pred_rel = output[1]
- length = pred_arc.shape[0]
- gold_arc = gold_arc[1:length + 1]
- gold_rel = gold_rel[1:length + 1]
-
- arc_mask = np.equal(pred_arc, gold_arc)
- uc += np.sum(arc_mask)
- total += length
-
- lc += np.sum(np.equal(pred_rel, gold_rel) * arc_mask)
- sent_idx = record[idx]
- results[sent_idx] = output
- idx += 1
- speed = len(record) / seconds
- UAS = uc / total * 100
- LAS = lc / total * 100
- if output_file:
- arcs = reduce(lambda x, y: x + y, [list(result[0]) for result in results])
- rels = reduce(lambda x, y: x + y, [list(result[1]) for result in results])
- idx = 0
- with open(test_file) as f:
- if debug:
- f = f.readlines()[:1000]
- with open(output_file, 'w') as fo:
- for line in f:
- info = line.strip().split()
- if info:
- arc_offset = 5
- rel_offset = 6
- if len(info) == 10: # conll or conllx
- arc_offset = 6
- rel_offset = 7
- # assert len(info) == 10, 'Illegal line: %s' % line
- info[arc_offset] = str(arcs[idx])
- info[rel_offset] = vocab.id2rel(rels[idx])
- fo.write('\t'.join(info) + '\n')
- idx += 1
- else:
- fo.write('\n')
- return UAS, LAS, speed
-
-
-def prf(correct, pred_sum, gold_sum):
- """
- Calculate precision, recall and f1 score
- Parameters
- ----------
- correct : int
- number of correct predictions
- pred_sum : int
- number of predictions
- gold_sum : int
- number of gold answers
- Returns
- -------
- tuple
- (p, r, f)
- """
- if pred_sum:
- p = correct / pred_sum
- else:
- p = 0
- if gold_sum:
- r = correct / gold_sum
- else:
- r = 0
- if p + r:
- f = 2 * p * r / (p + r)
- else:
- f = 0
- return p, r, f
diff --git a/scripts/preprocess/README.md b/scripts/preprocess/README.md
new file mode 100644
index 0000000000..bbecc9ca27
--- /dev/null
+++ b/scripts/preprocess/README.md
@@ -0,0 +1,20 @@
+# Data Preprocessing Toolkit in GluonNLP
+
+## Clean and Tokenize a Parallel Corpus
+
+To clean and tokenize a parallel corpus, use
+```
+nlp_preprocess clean_tok_para_corpus --help
+```
+
+## Learn/Apply Subwords
+
+To learn a subword tokenizer, use
+```
+nlp_preprocess learn_subword --help
+```
+
+To apply the learned subword tokenizer, use
+```
+nlp_preprocess apply_subword --help
+```
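
For illustration, here is a minimal sketch of driving the two scripts programmatically through their `get_parser()`/`main()` entry points instead of the `nlp_preprocess` CLI. It assumes the repository root is on `PYTHONPATH` so that `scripts.preprocess` is importable; the corpus path, vocabulary size and output names are placeholders.

```python
# Hypothetical end-to-end sketch: learn a sentencepiece subword model from a
# tokenized corpus, then apply it to the same corpus.
from scripts.preprocess import learn_subword, apply_subword

learn_args = learn_subword.get_parser().parse_args([
    '--corpus', 'corpus.tok.en',          # placeholder: a cleaned, tokenized corpus
    '--model', 'spm',
    '--vocab-size', '5000',               # placeholder vocabulary size
    '--save-dir', 'subword_model',
])
learn_subword.main(learn_args)            # writes subword_model/spm.model and subword_model/spm.vocab

apply_args = apply_subword.get_parser().parse_args([
    '--corpus', 'corpus.tok.en',
    '--model', 'spm',
    '--model-path', 'subword_model/spm.model',
    '--vocab-path', 'subword_model/spm.vocab',
    '--save-path', 'corpus.tok.spm.en',   # placeholder output path
])
apply_subword.main(apply_args)            # writes the subword-segmented corpus
```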
diff --git a/scripts/preprocess/__init__.py b/scripts/preprocess/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/scripts/preprocess/__main__.py b/scripts/preprocess/__main__.py
new file mode 100644
index 0000000000..472d304e58
--- /dev/null
+++ b/scripts/preprocess/__main__.py
@@ -0,0 +1,49 @@
+import argparse
+import textwrap
+
+from . import (
+ clean_tok_para_corpus,
+ clean_tok_mono_corpus,
+ learn_subword,
+ apply_subword
+)
+
+
+SUBCOMMANDS = ['clean_tok_para_corpus', 'clean_tok_mono_corpus',
+ 'learn_subword', 'apply_subword', 'help']
+
+
+def cli_main():
+ parser = argparse.ArgumentParser(
+ description='Sharable data preprocessing utilities in GluonNLP.',
+ prog='nlp_preprocess', add_help=False)
+ parser.add_argument('command', type=str,
+ choices=SUBCOMMANDS,
+ metavar='[subcommand]',
+ help='The subcommand to use. '
+ 'Choices are {}.'.format(SUBCOMMANDS))
+ args, other_args = parser.parse_known_args()
+ if args.command == 'clean_tok_para_corpus':
+ parser = clean_tok_para_corpus.get_parser()
+ sub_args = parser.parse_args(other_args)
+ clean_tok_para_corpus.main(sub_args)
+ elif args.command == 'clean_tok_mono_corpus':
+ parser = clean_tok_mono_corpus.get_parser()
+ sub_args = parser.parse_args(other_args)
+ clean_tok_mono_corpus.main(sub_args)
+ elif args.command == 'learn_subword':
+ parser = learn_subword.get_parser()
+ sub_args = parser.parse_args(other_args)
+ learn_subword.main(sub_args)
+ elif args.command == 'apply_subword':
+ parser = apply_subword.get_parser()
+ sub_args = parser.parse_args(other_args)
+ apply_subword.main(sub_args)
+ elif args.command == 'help':
+ parser.print_help()
+ else:
+ parser.print_help()
+
+
+if __name__ == '__main__':
+ cli_main()
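
The dispatcher above parses only the leading subcommand with `parse_known_args()` and forwards the remaining arguments to the chosen module. A small sketch of exercising it without the console entry point, assuming the package is importable; the `help` subcommand simply prints the top-level usage.

```python
# Hypothetical: invoke the subcommand dispatcher programmatically.
import sys
from scripts.preprocess.__main__ import cli_main

sys.argv = ['nlp_preprocess', 'help']   # equivalent to running: nlp_preprocess help
cli_main()                              # prints the top-level help and returns
```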
diff --git a/scripts/preprocess/apply_subword.py b/scripts/preprocess/apply_subword.py
new file mode 100644
index 0000000000..dc4c0c974a
--- /dev/null
+++ b/scripts/preprocess/apply_subword.py
@@ -0,0 +1,176 @@
+import argparse
+import textwrap
+from multiprocessing import Pool
+import numpy as np
+import time
+from gluonnlp.data import tokenizers
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''
+ Encode based on different implementations.
+
+ We support the following models:
+
+ "python3 apply_subword.py --model spm" : Encode with Sentencepiece Model;
+ "python3 apply_subword.py --model subword_nmt" : Encode with the subword-nmt package;
+ "python3 apply_subword.py --model yttm" : Encode with YouTokenToMe;
+ "python3 apply_subword.py --model hf_bytebpe" : Encode with the Byte-level BPE Tokenizer Implemented by Huggingface.
+            "python3 apply_subword.py --model hf_wordpiece" : Encode with the Wordpiece Tokenizer Implemented by Huggingface.
+ "python3 apply_subword.py --model hf_bpe" : Encode with the BPE Tokenizer Implemented by Huggingface.
+ ''')
+ )
+ parser.add_argument('--corpus', type=str, nargs='+', required=True,
+ help='Path of the corpus. '
+ 'You may input multiple corpus files separated by space.')
+ parser.add_argument('--save-path', type=str, required=True,
+ help='Path of the output file')
+ parser.add_argument('--model-path', type=str, default=None,
+ help='Path of the model file')
+ parser.add_argument('--vocab-path', type=str, default=None,
+ help='Path of the vocabulary file')
+ parser.add_argument('--model', type=str, choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, help='Subword model type')
+ parser.add_argument('--num-process', type=int, default=16,
+ help='Number of process')
+ parser.add_argument('--lowercase', action='store_true', default=False,
+ help='Use lowercase, '
+ 'only applicable to hf_bpe, hf_bytebpe and hf_wordpiece')
+ parser.add_argument('--strip-accents', action='store_true', default=False,
+ help='Disable BERT characters normalization, '
+ 'only applicable to hf_wordpiece')
+ parser.add_argument('--output-type', type=str, choices=['subword', 'id'], default='subword',
+                        help='Whether to output subwords or ids')
+ parser.add_argument('--bpe-dropout', type=float, default=None,
+ help='BPE dropout, applicable to subword_nmt, yttm, hf_bpe and hf_bytebpe')
+
+ return parser
+
+
+class ParallelCorpusApplyer:
+ def __init__(self, corpus, tokenizer_model, output_type):
+ self.chunk_size = 1024 * 1024
+ self.corpus = corpus
+ self.tokenizer_model = tokenizer_model
+ self.output_type = output_type
+
+ def chunk_iter(self, step=10):
+ for corpus_path in self.corpus:
+ line_pos = [0]
+ with open(corpus_path, 'rb') as fcb:
+ pos = 0
+ for line in fcb:
+ pos += len(line)
+ line_pos.append(pos)
+ line_pos = np.array(line_pos, dtype=np.int64)
+ line_size = line_pos[1:] - line_pos[:-1]
+ num_lines = line_pos.shape[0] - 1
+ budget = self.chunk_size
+ chunk_start = 0
+ cur_chunk_size = 0
+ for i in range(0, num_lines, step):
+ line_batch_num = min(num_lines - i, step)
+ batch_line_size = line_size[i:(i + line_batch_num)].sum()
+ budget -= batch_line_size
+ cur_chunk_size += batch_line_size
+ if budget <= 0 or i + step >= num_lines:
+ yield corpus_path, chunk_start, cur_chunk_size
+ chunk_start += cur_chunk_size
+ budget = self.chunk_size
+ cur_chunk_size = 0
+
+ def process_chunk(self, args):
+ corpus_path, chunk_start, cur_chunk_size = args
+ with open(corpus_path, 'rb') as fcb:
+ fcb.seek(chunk_start)
+ lines_byte = fcb.read(cur_chunk_size)
+ lines_byte = lines_byte.splitlines()
+ sentences = [line_byte.decode('utf-8').strip() for line_byte in lines_byte]
+ all_tokens = self.tokenizer_model.encode(sentences, self.output_type)
+ tokenized_sentences = []
+ for ele_tokens in all_tokens:
+ if self.output_type == int:
+ ele_tokens = [str(token) for token in ele_tokens]
+ tokenized_sentences.append(' '.join(ele_tokens))
+ sentence_num = len(tokenized_sentences)
+        token_num = sum(len(sentence.split(' ')) for sentence in tokenized_sentences)
+ unk = self.tokenizer_model.vocab.unk_token
+ unk_num = sum(sentence.count(unk) for sentence in tokenized_sentences)
+ return tokenized_sentences, sentence_num, token_num, unk_num
+
+
+def main(args):
+ start = time.time()
+ if args.model == 'spm':
+ tokenizer_model = tokenizers.create('spm',
+ model_path=args.model_path,
+ vocab=args.vocab_path)
+ elif args.model == 'subword_nmt':
+ tokenizer_model = tokenizers.create('subword_nmt',
+ codec_path=args.model_path,
+ vocab_path=args.vocab_path,
+ bpe_dropout=args.bpe_dropout)
+ elif args.model == 'yttm':
+ args.bpe_dropout = 0.0 if not args.bpe_dropout else args.bpe_dropout
+ tokenizer_model = tokenizers.create('yttm',
+ model_path=args.model_path,
+ bpe_dropout=args.bpe_dropout,
+ n_threads=1)
+ elif args.model == 'hf_bytebpe':
+ tokenizer_model = tokenizers.create('hf_bytebpe',
+ merges_file=args.model_path,
+ vocab_file=args.vocab_path,
+ dropout=args.bpe_dropout,
+ lowercase=args.lowercase)
+ elif args.model == 'hf_wordpiece':
+ tokenizer_model = tokenizers.create('hf_wordpiece',
+ vocab_file=args.vocab_path,
+ lowercase=args.lowercase,
+ strip_accents=args.strip_accents)
+ elif args.model == 'hf_bpe':
+ tokenizer_model = tokenizers.create('hf_bpe',
+ merges_file=args.model_path,
+ vocab_file=args.vocab_path,
+ dropout=args.bpe_dropout,
+ lowercase=args.lowercase)
+ else:
+ raise NotImplementedError
+    print('Applying {} to {}'.format(tokenizer_model.__class__.__name__,
+ ', '.join(args.corpus)))
+ output_type = {'subword': str, 'id': int}[args.output_type]
+ applyer = ParallelCorpusApplyer(args.corpus, tokenizer_model, output_type)
+ with open(args.save_path, 'w', encoding='utf-8', newline='\n') as fo:
+ with Pool(args.num_process) as pool:
+ sentence_count = token_count = unk_count = 0
+ for i, (tokenized_sentences, sentence_num, token_num, unk_num) in \
+ enumerate(pool.imap(applyer.process_chunk, applyer.chunk_iter())):
+ fo.write('\n'.join(tokenized_sentences))
+ fo.write('\n')
+ sentence_count += sentence_num
+ token_count += token_num
+ unk_count += unk_num
+ if (i + 1) % 100 == 0:
+                    print('Chunk {}, #Lines processed: {}'
+ .format(i + 1, sentence_count))
+ end = time.time()
+    print('Done, #Lines processed {}, Avg tokens per sentence {:.1f}, '
+          'Unknown rate {:.1f}%, Time spent {}'
+ .format(sentence_count, token_count / sentence_count,
+ unk_count * 100 / token_count, end - start))
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
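
`ParallelCorpusApplyer.chunk_iter` (and the `clean_tok_*` scripts that follow) splits work by byte offsets so that each worker can `seek()` into the file and read its chunk independently. Below is a self-contained sketch of that idea, not part of the patch; `corpus.txt` is a hypothetical input file with at least three lines.

```python
# Minimal sketch of byte-offset chunking for multiprocessing-friendly file reads.
import numpy as np

def line_byte_offsets(path):
    """Cumulative byte offset of each line start, with one trailing entry for EOF."""
    offsets = [0]
    with open(path, 'rb') as f:
        for line in f:
            offsets.append(offsets[-1] + len(line))
    return np.array(offsets, dtype=np.int64)

def read_chunk(path, start, size):
    """Seek to `start`, read `size` bytes and decode them into text lines."""
    with open(path, 'rb') as f:
        f.seek(start)
        return [b.decode('utf-8') for b in f.read(size).splitlines()]

offsets = line_byte_offsets('corpus.txt')   # hypothetical file
# Read the first three lines as a single chunk, the way a worker would.
first_three = read_chunk('corpus.txt', int(offsets[0]), int(offsets[3] - offsets[0]))
```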
diff --git a/scripts/preprocess/clean_tok_mono_corpus.py b/scripts/preprocess/clean_tok_mono_corpus.py
new file mode 100644
index 0000000000..79416b4798
--- /dev/null
+++ b/scripts/preprocess/clean_tok_mono_corpus.py
@@ -0,0 +1,252 @@
+import argparse
+import os
+import multiprocessing
+import time
+import numpy as np
+import warnings
+import re
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data.tokenizers import MosesTokenizer, BaseTokenizer,\
+ WhitespaceTokenizer, JiebaTokenizer
+from typing import List, Union, Optional
+re._MAXCACHE = 1024
+
+
+def get_tokenizer(tokenizer, lang=None):
+ if isinstance(tokenizer, BaseTokenizer):
+ return tokenizer
+ else:
+ if tokenizer == 'moses':
+ return MosesTokenizer(lang=lang)
+ elif tokenizer == 'whitespace':
+ return WhitespaceTokenizer()
+ elif tokenizer == 'jieba':
+ return JiebaTokenizer()
+ else:
+ raise NotImplementedError
+
+
+# TODO(sxjscience) Consider whether to
+def check_latin1(sentence: str) -> bool:
+ """Check whether the sentence can be encoded in latin1
+
+ This is used in
+ https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+ The idea is to filter the sentences with rare unicode glyphs
+
+ Returns
+ -------
+ ret
+ Whether sentences are latin1
+ """
+ try:
+ sentence.encode('latin1')
+ except UnicodeEncodeError:
+ return False
+ else:
+ return True
+
+
+def get_line_byte_start(corpus_path: str) -> np.ndarray:
+    """Get the start position of each line in terms of bytes so that we can use seek + read to
+ load an arbitrary line.
+
+ Parameters
+ ----------
+ corpus_path
+ The path of the corpus
+
+ Returns
+ -------
+ line_pos
+        Shape (#Lines + 1,)
+ """
+ line_pos = [0]
+ with open(corpus_path, 'rb') as in_f:
+ pos = 0
+ for line in in_f:
+ pos += len(line)
+ line_pos.append(pos)
+ return np.array(line_pos, dtype=np.int64)
+
+
+class MonoCorpusProcessor:
+    """Process the sentences of a monolingual corpus.
+
+ This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+ The difference is that it is customizable with pure python.
+
+ By default, we will perform the following pre-processing pipeline.
+ Each stage could be turned on/off and specialized based on the input arguments.
+ Also, you may directly revise the code and write your own processing script.
+
+ 1. Normalize sentence
+ 2. Pre-filter
+ 3. Tokenize the sentence
+ 4. Filter the sentence based on different rules
+        4.1 Remove sentences that are empty after tokenization
+        4.2 Remove sentences where not `min_num_words <= len(sentence) <= max_num_words`
+ """
+ def __init__(self, lang: str,
+ normalize: bool = True,
+ tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ min_num_words: Optional[int] = None,
+ max_num_words: Optional[int] = None,
+ discard_non_latin1: bool = False):
+ self._lang = lang
+ if normalize:
+ self._normalizer = MosesNormalizer(lang=lang)
+ self._tokenizer = get_tokenizer(tokenizer, lang)
+ self._min_num_words = min_num_words
+ self._max_num_words = max_num_words
+ self._discard_non_latin1 = discard_non_latin1
+
+ def process_chunk(self, args):
+ path, chunk_start, chunk_size = args
+ processed_lines = []
+ with open(path, 'rb') as in_f:
+ # Read chunk
+ in_f.seek(chunk_start)
+ lines = in_f.read(chunk_size)
+ lines = lines.splitlines()
+ unfiltered_line_num = len(lines)
+ for line in lines:
+ line = line.decode('utf-8').strip()
+ # 1. Normalize
+ line = self._normalizer(line)
+ # 2. Filter after normalization.
+ if self._discard_non_latin1:
+ if not check_latin1(line):
+ continue
+ # 3. Tokenize the sentence
+ tokens = self._tokenizer.encode(line)
+ # 4. Filter after tokenization. Filter with multiple rules
+ if len(tokens) == 0:
+ continue
+ if self._max_num_words is not None:
+ if len(tokens) > self._max_num_words:
+ continue
+ if self._min_num_words is not None:
+ if len(tokens) < self._min_num_words:
+ continue
+ processed_lines.append(' '.join(tokens))
+ return processed_lines, unfiltered_line_num
+
+ def process_mono_corpus(self,
+ corpus_paths: List[str],
+ out_path: str,
+ chunk_size: int = 1024 * 1024,
+ num_process: int = 8) -> int:
+ """Preprocess the mono corpus
+
+ Parameters
+ ----------
+ corpus_paths
+ Corpus paths
+ out_path
+ Write the results to the output path
+ chunk_size
+ Approximately split the corpus files into multiple chunks
+ num_process
+ The number of process
+
+ Returns
+ -------
+ line_count
+ The number of lines in the final filtered file
+ """
+ start = time.time()
+ total_line_count = 0
+ filtered_line_count = 0
+
+ def chunk_iterator(step=10):
+ for path in corpus_paths:
+ line_pos = get_line_byte_start(path)
+ line_size = line_pos[1:] - line_pos[:-1]
+ num_lines = line_pos.shape[0] - 1
+ budget = chunk_size
+ chunk_start = 0
+ cur_chunk_size = 0
+ for i in range(0, num_lines, step):
+ line_batch_num = min(num_lines - i, step)
+ batch_line_size = line_size[i:(i + line_batch_num)].sum()
+ budget -= batch_line_size
+ cur_chunk_size += batch_line_size
+ if budget <= 0 or i + step >= num_lines:
+ yield path, chunk_start, cur_chunk_size
+ chunk_start += cur_chunk_size
+ cur_chunk_size = 0
+ budget = chunk_size
+
+ with open(out_path, 'w', encoding='utf-8', newline='\n') as out_f:
+ with multiprocessing.Pool(num_process) as pool:
+ for i, (processed_lines, unfiltered_line_num) in \
+ enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+ out_f.write('\n'.join(processed_lines) + '\n')
+ filtered_line_count += len(processed_lines)
+ total_line_count += unfiltered_line_num
+ if (i + 1) % 100 == 0:
+ print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+ .format(i + 1, total_line_count,
+ total_line_count - filtered_line_count,
+ filtered_line_count))
+ end = time.time()
+ print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+ total_line_count,
+ end - start))
+ return filtered_line_count
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='Clean mono corpus used in machine translation.')
+ parser.add_argument('--corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--lang', type=str, required=True)
+ parser.add_argument('--save-path', type=str, default=None,
+ help='Path to save the cleaned and tokenized corpus. If not set, '
+ 'the default is "corpus.tok.{lang}"')
+ parser.add_argument('--tokenizer', type=str, default='moses')
+ parser.add_argument('--min-num-words', type=int, default=None)
+ parser.add_argument('--max-num-words', type=int, default=None)
+ parser.add_argument('--discard-non-latin1', action='store_true',
+                        help='Whether to discard sentences that cannot be '
+                             'encoded into latin1.')
+ parser.add_argument('--num-process', type=int, default=8,
+ help='number of process')
+ parser.add_argument('--overwrite', action='store_true')
+
+ return parser
+
+
+def main(args):
+ corpus_processor = MonoCorpusProcessor(lang=args.lang,
+ tokenizer=args.tokenizer,
+ min_num_words=args.min_num_words,
+ max_num_words=args.max_num_words,
+ discard_non_latin1=args.discard_non_latin1)
+ print('Clean the mono corpus:')
+ print(' {}: {}'.format(args.lang, args.corpus))
+ if args.save_path is None:
+ save_path = 'corpus.tok.{}'.format(args.lang)
+ else:
+ save_path = args.save_path
+ print('Save to {} -> {} \n'.format(args.lang, save_path))
+ if os.path.exists(save_path) and not args.overwrite:
+        warnings.warn('{} exists, skip. If you need to overwrite this file, '
+                      'rerun the script with --overwrite.'.format(save_path))
+ else:
+ corpus_processor.process_mono_corpus(
+ corpus_paths=args.corpus,
+ out_path=save_path,
+ num_process=args.num_process)
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
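
A hedged usage sketch of `MonoCorpusProcessor`, mirroring what `main()` does when the script is invoked via the CLI. The import assumes the repository root is on `PYTHONPATH`; file names and thresholds are placeholders, not a prescribed configuration.

```python
from scripts.preprocess.clean_tok_mono_corpus import MonoCorpusProcessor

processor = MonoCorpusProcessor(lang='en',
                                tokenizer='moses',
                                min_num_words=1,
                                max_num_words=250,           # placeholder length cap
                                discard_non_latin1=False)
kept = processor.process_mono_corpus(corpus_paths=['raw.en'],  # placeholder input file
                                     out_path='corpus.tok.en',
                                     num_process=4)
print('Lines kept after cleaning:', kept)
```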
diff --git a/scripts/preprocess/clean_tok_para_corpus.py b/scripts/preprocess/clean_tok_para_corpus.py
new file mode 100644
index 0000000000..dd4afcb4a6
--- /dev/null
+++ b/scripts/preprocess/clean_tok_para_corpus.py
@@ -0,0 +1,314 @@
+import argparse
+import os
+import multiprocessing
+import time
+import numpy as np
+import warnings
+import re
+from gluonnlp.data.filtering import MosesNormalizer
+from gluonnlp.data.tokenizers import MosesTokenizer, BaseTokenizer,\
+ WhitespaceTokenizer, JiebaTokenizer
+from typing import List, Union, Optional
+re._MAXCACHE = 1024
+
+
+def get_tokenizer(tokenizer, lang=None):
+ if isinstance(tokenizer, BaseTokenizer):
+ return tokenizer
+ else:
+ if tokenizer == 'moses':
+ return MosesTokenizer(lang=lang)
+ elif tokenizer == 'whitespace':
+ return WhitespaceTokenizer()
+ elif tokenizer == 'jieba':
+ return JiebaTokenizer()
+ else:
+ raise NotImplementedError
+
+
+# TODO(sxjscience) Consider whether to
+def check_both_latin1(src_sentence: str, tgt_sentence: str) -> bool:
+ """Check whether the sentence pair can all be encoded in latin1
+
+ This is used in
+ https://github.com/mlperf/training/blob/master/rnn_translator/pytorch/scripts/filter_dataset.py
+
+    The idea is to filter out sentences with rare unicode glyphs, which are unlikely to be en-de
+
+ Returns
+ -------
+ ret
+ Whether both sentences are latin1
+ """
+ try:
+ src_sentence.encode('latin1')
+ tgt_sentence.encode('latin1')
+ except UnicodeEncodeError:
+ return False
+ else:
+ return True
+
+
+def get_line_byte_start(corpus_path: str) -> np.ndarray:
+    """Get the start position of each line in terms of bytes so that we can use seek + read to
+ load an arbitrary line.
+
+ Parameters
+ ----------
+ corpus_path
+ The path of the corpus
+
+ Returns
+ -------
+ line_pos
+        Shape (#Lines + 1,)
+ """
+ line_pos = [0]
+ with open(corpus_path, 'rb') as in_f:
+ pos = 0
+ for line in in_f:
+ pos += len(line)
+ line_pos.append(pos)
+ return np.array(line_pos, dtype=np.int64)
+
+
+class ParallelCorpusProcessor:
+    """Process a pair of parallel corpora.
+
+ This largely recovers the functionality of 'clean-corpus-n.perl' in mosesdecoder.
+ The difference is that it is customizable with pure python.
+
+ By default, we will perform the following pre-processing pipeline.
+ Each stage could be turned on/off and specialized based on the input arguments.
+ Also, you may directly revise the code and write your own processing script.
+
+ 1. Normalize sentence
+ 2. Pre-filter
+ 3. Tokenize the sentence
+    4. Filter the sentence pairs based on different rules
+        4.1 Remove pairs where `max(len(lhs) / len(rhs), len(rhs) / len(lhs)) > max_ratio`
+        4.2 Remove pairs where not `min_num_words <= len(lhs) <= max_num_words` and
+            `min_num_words <= len(rhs) <= max_num_words`
+ """
+ def __init__(self, src_lang: str, tgt_lang: str,
+ normalize: bool = True,
+ src_tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ tgt_tokenizer: Union[str, BaseTokenizer] = 'whitespace',
+ max_ratio: Optional[float] = None,
+ min_num_words: Optional[int] = None,
+ max_num_words: Optional[int] = None,
+ discard_non_latin1: bool = False):
+ self._src_lang = src_lang
+ self._tgt_lang = tgt_lang
+ if normalize:
+ self._src_normalizer = MosesNormalizer(lang=src_lang)
+ self._tgt_normalizer = MosesNormalizer(lang=tgt_lang)
+ self._src_tokenizer = get_tokenizer(src_tokenizer, src_lang)
+ self._tgt_tokenizer = get_tokenizer(tgt_tokenizer, tgt_lang)
+ self._max_ratio = max_ratio
+ self._min_num_words = min_num_words
+ self._max_num_words = max_num_words
+ self._discard_non_latin1 = discard_non_latin1
+
+ def process_chunk(self, args):
+ src_path, src_chunk_start, src_chunk_size, tgt_path, tgt_chunk_start, tgt_chunk_size = args
+ processed_src_lines = []
+ processed_tgt_lines = []
+ with open(src_path, 'rb') as src_in_f:
+ with open(tgt_path, 'rb') as tgt_in_f:
+ # Read chunk from source and target
+ src_in_f.seek(src_chunk_start)
+ src_lines = src_in_f.read(src_chunk_size)
+ tgt_in_f.seek(tgt_chunk_start)
+ tgt_lines = tgt_in_f.read(tgt_chunk_size)
+ src_lines = src_lines.splitlines()
+ tgt_lines = tgt_lines.splitlines()
+ unfiltered_line_num = len(src_lines)
+ for src_line, tgt_line in zip(src_lines, tgt_lines):
+ src_line = src_line.decode('utf-8').strip()
+ tgt_line = tgt_line.decode('utf-8').strip()
+ # 1. Normalize
+ src_line = self._src_normalizer(src_line)
+ tgt_line = self._tgt_normalizer(tgt_line)
+ # 2. Filter after normalization.
+ if self._discard_non_latin1:
+ if not check_both_latin1(src_line, tgt_line):
+ continue
+ # 3. Tokenize the sentence
+ src_tokens = self._src_tokenizer.encode(src_line)
+ tgt_tokens = self._tgt_tokenizer.encode(tgt_line)
+ # 4. Filter after tokenization. Filter with multiple rules
+ if len(src_tokens) == 0 or len(tgt_tokens) == 0:
+ continue
+ if self._max_ratio is not None:
+ if max(len(src_tokens) / len(tgt_tokens),
+ len(tgt_tokens) / len(src_tokens)) > self._max_ratio:
+ continue
+ if self._max_num_words is not None:
+ if len(src_tokens) > self._max_num_words or\
+ len(tgt_tokens) > self._max_num_words:
+ continue
+ if self._min_num_words is not None:
+ if len(src_tokens) < self._min_num_words\
+ or len(tgt_tokens) < self._min_num_words:
+ continue
+ processed_src_lines.append(' '.join(src_tokens))
+ processed_tgt_lines.append(' '.join(tgt_tokens))
+ return processed_src_lines, processed_tgt_lines, unfiltered_line_num
+
+ def process_parallel_corpus(self, src_corpus_paths: List[str],
+ tgt_corpus_paths: List[str],
+ src_out_path: str, tgt_out_path: str,
+ chunk_size: int = 1024 * 1024,
+ num_process: int = 8) -> int:
+ """Preprocess the parallel corpus
+
+ Parameters
+ ----------
+ src_corpus_paths
+ Source corpus paths
+ tgt_corpus_paths
+ Target corpus paths
+ src_out_path
+ Write the results to the source output path
+ tgt_out_path
+ Write the results to the target output path
+ chunk_size
+ Approximately split the corpus files into multiple chunks
+ num_process
+ The number of process
+
+ Returns
+ -------
+ line_count
+ The number of lines in the final filtered file
+ """
+ start = time.time()
+ total_line_count = 0
+ filtered_line_count = 0
+
+ def chunk_iterator(step=10):
+ for src_path, tgt_path in zip(src_corpus_paths, tgt_corpus_paths):
+ src_line_pos = get_line_byte_start(src_path)
+ tgt_line_pos = get_line_byte_start(tgt_path)
+ src_line_size = src_line_pos[1:] - src_line_pos[:-1]
+ tgt_line_size = tgt_line_pos[1:] - tgt_line_pos[:-1]
+ num_src_lines = src_line_pos.shape[0] - 1
+ num_tgt_lines = tgt_line_pos.shape[0] - 1
+ assert num_src_lines == num_tgt_lines
+ src_budget = chunk_size
+ tgt_budget = chunk_size
+ src_chunk_start = 0
+ tgt_chunk_start = 0
+ src_chunk_size = 0
+ tgt_chunk_size = 0
+ for i in range(0, num_src_lines, step):
+ line_batch_num = min(num_src_lines - i, step)
+ src_batch_line_size = src_line_size[i:(i + line_batch_num)].sum()
+ tgt_batch_line_size = tgt_line_size[i:(i + line_batch_num)].sum()
+ src_budget -= src_batch_line_size
+ tgt_budget -= tgt_batch_line_size
+ src_chunk_size += src_batch_line_size
+ tgt_chunk_size += tgt_batch_line_size
+ if src_budget <= 0 or tgt_budget <= 0 or i + step >= num_src_lines:
+ yield src_path, src_chunk_start, src_chunk_size,\
+ tgt_path, tgt_chunk_start, tgt_chunk_size
+ src_chunk_start += src_chunk_size
+ tgt_chunk_start += tgt_chunk_size
+ src_chunk_size = 0
+ tgt_chunk_size = 0
+ src_budget = chunk_size
+ tgt_budget = chunk_size
+
+ with open(src_out_path, 'w', encoding='utf-8', newline='\n') as src_out_f:
+ with open(tgt_out_path, 'w', encoding='utf-8', newline='\n') as tgt_out_f:
+ with multiprocessing.Pool(num_process) as pool:
+ for i, (processed_src_lines, processed_tgt_lines, unfiltered_line_num) in \
+ enumerate(pool.imap(self.process_chunk, chunk_iterator())):
+ src_out_f.write('\n'.join(processed_src_lines) + '\n')
+ tgt_out_f.write('\n'.join(processed_tgt_lines) + '\n')
+ filtered_line_count += len(processed_src_lines)
+ total_line_count += unfiltered_line_num
+ if (i + 1) % 100 == 0:
+ print('Chunk {}, #Lines Processed: {}, Filtered: {}, Remain: {}'
+ .format(i + 1, total_line_count,
+ total_line_count - filtered_line_count,
+ filtered_line_count))
+ end = time.time()
+ print('Done, #Lines {}/{}, Time spent {}'.format(filtered_line_count,
+ total_line_count,
+ end - start))
+ return filtered_line_count
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ description='Clean parallel corpus used in machine translation.')
+ parser.add_argument('--src-corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--tgt-corpus', type=str, nargs='+', required=True)
+ parser.add_argument('--src-lang', type=str, required=True)
+ parser.add_argument('--tgt-lang', type=str, required=True)
+ parser.add_argument('--src-save-path', type=str, default=None,
+ help='Path to save the cleaned and tokenized source corpus. If not set, '
+ 'the default is "corpus.tok.{src_lang}"')
+ parser.add_argument('--tgt-save-path', type=str, default=None,
+                        help='Path to save the cleaned and tokenized target corpus. If not set, '
+                             'the default is "corpus.tok.{tgt_lang}"')
+ parser.add_argument('--src-tokenizer', type=str, default='moses')
+ parser.add_argument('--tgt-tokenizer', type=str, default='moses')
+ parser.add_argument('--max-ratio', type=float, default=None)
+ parser.add_argument('--min-num-words', type=int, default=None)
+ parser.add_argument('--max-num-words', type=int, default=None)
+ parser.add_argument('--discard-non-latin1', action='store_true',
+ help='Whether to discard the sentence pair if both sentences cannot be '
+ 'encoded into latin1.')
+ parser.add_argument('--num-process', type=int, default=8,
+ help='number of process')
+ parser.add_argument('--overwrite', action='store_true')
+
+ return parser
+
+
+def main(args):
+ src_lang, tgt_lang = args.src_lang, args.tgt_lang
+ corpus_processor = ParallelCorpusProcessor(src_lang=src_lang,
+ tgt_lang=tgt_lang,
+ src_tokenizer=args.src_tokenizer,
+ tgt_tokenizer=args.tgt_tokenizer,
+ max_ratio=args.max_ratio,
+ min_num_words=args.min_num_words,
+ max_num_words=args.max_num_words,
+ discard_non_latin1=args.discard_non_latin1)
+ print('Clean the corpus:')
+ print(' Source {}: {}'.format(src_lang, args.src_corpus))
+ print(' Target {}: {}'.format(tgt_lang, args.tgt_corpus))
+ if args.src_save_path is None:
+ src_save_path = 'corpus.tok.{}'.format(src_lang)
+ else:
+ src_save_path = args.src_save_path
+ if args.tgt_save_path is None:
+ tgt_save_path = 'corpus.tok.{}'.format(tgt_lang)
+ else:
+ tgt_save_path = args.tgt_save_path
+ print('Save to {} -> {} \n'
+ ' {} -> {}'.format(src_lang, src_save_path, tgt_lang, tgt_save_path))
+ if (os.path.exists(src_save_path) or os.path.exists(tgt_save_path)) and not args.overwrite:
+ warnings.warn('{} or {} exists, skip. If you need to overwrite these two files, '
+ 'rerun the script with --overwrite.'.format(src_save_path, tgt_save_path))
+ else:
+ corpus_processor.process_parallel_corpus(
+ src_corpus_paths=args.src_corpus,
+ tgt_corpus_paths=args.tgt_corpus,
+ src_out_path=src_save_path,
+ tgt_out_path=tgt_save_path,
+ num_process=args.num_process)
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
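
The same pattern applies to the parallel cleaner. Again a sketch only, with placeholder paths and thresholds; it mirrors the `main()` function above rather than adding anything new.

```python
from scripts.preprocess.clean_tok_para_corpus import ParallelCorpusProcessor

processor = ParallelCorpusProcessor(src_lang='en', tgt_lang='de',
                                    src_tokenizer='moses', tgt_tokenizer='moses',
                                    max_ratio=1.5,               # placeholder ratio filter
                                    min_num_words=1, max_num_words=250,
                                    discard_non_latin1=True)
kept = processor.process_parallel_corpus(src_corpus_paths=['raw.en'],   # placeholder inputs
                                         tgt_corpus_paths=['raw.de'],
                                         src_out_path='corpus.tok.en',
                                         tgt_out_path='corpus.tok.de',
                                         num_process=4)
print('Sentence pairs kept:', kept)
```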
diff --git a/scripts/preprocess/learn_subword.py b/scripts/preprocess/learn_subword.py
new file mode 100644
index 0000000000..ba0dbde627
--- /dev/null
+++ b/scripts/preprocess/learn_subword.py
@@ -0,0 +1,252 @@
+from gluonnlp.utils.lazy_imports import try_import_sentencepiece,\
+ try_import_subword_nmt, try_import_yttm, try_import_huggingface_tokenizers
+import argparse
+import textwrap
+import os
+from collections import OrderedDict
+import json
+from uuid import uuid4
+from gluonnlp.data import Vocab
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''
+ Learn BPE based on different implementations.
+
+ We support the following models:
+
+ "python3 learn_subword.py --model spm" : Train a Sentencepiece Model on raw text;
+ "python3 learn_subword.py --model subword_nmt" : Train with the subword-nmt package;
+ "python3 learn_subword.py --model yttm" : Train with YouTokenToMe;
+ "python3 learn_subword.py --model hf_bytebpe" : Train with the Byte-level BPE Tokenizer Implemented by Huggingface.
+            "python3 learn_subword.py --model hf_wordpiece" : Train with the Wordpiece Tokenizer Implemented by Huggingface.
+ "python3 learn_subword.py --model hf_bpe" : Train with the BPE Tokenizer Implemented by Huggingface.
+ ''')
+ )
+ parser.add_argument('--corpus', type=str, nargs='+', required=True,
+ help='Path of the corpus. '
+ 'You may input multiple corpus files separated by space.')
+ parser.add_argument('--vocab-size', type=int, required=True,
+ help='Estimated learned vocabulary size')
+ parser.add_argument('--model', type=str, choices=['spm',
+ 'subword_nmt',
+ 'yttm',
+ 'hf_bytebpe',
+ 'hf_wordpiece',
+ 'hf_bpe'],
+ required=True, help='Subword model type')
+ parser.add_argument('--save-dir', type=str, required=True,
+ help='Directory for saving the model and vocabulary file')
+ parser.add_argument('--coverage', type=float, default=1.0,
+ help='Amount of characters covered by the model, '
+ 'this is only applicable to spm and yttm')
+ parser.add_argument('--n-threads', type=int, default=-1,
+ help='Number of threads, only applicable to yttm')
+ parser.add_argument('--input-sentence-size', type=int, default=1000000,
+                        help='Maximum number of input sentences to use, only applicable to '
+                             'sentencepiece; you can reduce this value when running out of memory')
+ parser.add_argument('--lowercase', action='store_true', default=False,
+ help='Use lowercase, '
+ 'only applicable to hf_bpe, hf_bytebpe and hf_wordpiece')
+ parser.add_argument('--strip-accents', action='store_true', default=False,
+ help='Disable BERT characters normalization, '
+ 'only applicable to hf_wordpiece')
+ parser.add_argument('--disable-bos', action='store_true', default=False,
+ help='Disable bos token (default settings enable bos)')
+ parser.add_argument('--disable-eos', action='store_true', default=False,
+ help='Disable eos token (default settings enable eos)')
+ parser.add_argument('--disable-pad', action='store_true', default=False,
+ help='Disable pad token (default settings enable pad)')
+ parser.add_argument('--custom-special-tokens', type=str, nargs='*', default=[],
+ help='Specified special tokens key value pairs besides unk, '
+ 'bos, eos and pad, for example: '
+                             '--custom-special-tokens cls_token=<CLS> sep_token=<SEP>, '
+ 'this is not applicable to yttm')
+ return parser
+
+def main(args):
+ corpus_path_list = args.corpus
+ if not os.path.exists(args.save_dir):
+ os.makedirs(args.save_dir)
+ model_prefix = os.path.join(args.save_dir, args.model)
+ special_tokens_kv = OrderedDict()
+ # unk is always required
+ special_tokens_kv['unk_token'] = Vocab.UNK_TOKEN
+ if not args.disable_bos:
+ special_tokens_kv['bos_token'] = Vocab.BOS_TOKEN
+ if not args.disable_eos:
+ special_tokens_kv['eos_token'] = Vocab.EOS_TOKEN
+ if not args.disable_pad:
+ special_tokens_kv['pad_token'] = Vocab.PAD_TOKEN
+ # split custom special tokens
+ if args.model in ['yttm'] and len(args.custom_special_tokens) > 0:
+ raise ValueError('model {} do not support custom_special_tokens'.format(args.model))
+ for custom_special_token in args.custom_special_tokens:
+ kv = custom_special_token.split('=')
+ if not len(kv) == 2:
+ raise ValueError('parameter {} has wrong format'.format(custom_special_token))
+ k, v = kv[0], kv[1]
+ if k in special_tokens_kv:
+ raise ValueError('There are overlaps between the custom special tokens and the'
+ ' unk, bos, eos, pad tokens')
+ special_tokens_kv[k] = v
+    # hf_wordpiece must contain mask, cls and sep tokens
+    # the custom-defined mask, cls and sep tokens can overwrite the default settings
+ if args.model == 'hf_wordpiece':
+ if 'mask_token' not in special_tokens_kv:
+ special_tokens_kv['mask_token'] = Vocab.MASK_TOKEN
+ if 'cls_token' not in special_tokens_kv:
+ special_tokens_kv['cls_token'] = Vocab.CLS_TOKEN
+ if 'sep_token' not in special_tokens_kv:
+ special_tokens_kv['sep_token'] = Vocab.SEP_TOKEN
+ special_tokens = list(special_tokens_kv.values())
+ print('special tokens: ' + ', '.join(special_tokens))
+ vocab = []
+ if args.model == 'spm':
+ try_import_sentencepiece()
+ import sentencepiece as spm
+ corpus_path = ','.join(corpus_path_list)
+ script = '--input={} --model_prefix={} --vocab_size={} --character_coverage={} --input_sentence_size={}' \
+ .format(corpus_path, model_prefix, args.vocab_size, args.coverage, args.input_sentence_size)
+ script += (' --unk_id=' + str(special_tokens.index(Vocab.UNK_TOKEN)))
+ script += (' --bos_id=' + ('-1' if args.disable_bos else str(special_tokens.index(Vocab.BOS_TOKEN))))
+ script += (' --eos_id=' + ('-1' if args.disable_eos else str(special_tokens.index(Vocab.EOS_TOKEN))))
+ script += (' --pad_id=' + ('-1' if args.disable_pad else str(special_tokens.index(Vocab.PAD_TOKEN))))
+ if len(args.custom_special_tokens) > 0:
+ ids_in_script = script.count('_id')
+ script += (' --control_symbols=' + ','.join(special_tokens[ids_in_script:]))
+ print(script)
+ spm.SentencePieceTrainer.Train(script)
+        if 'bos_token' in special_tokens_kv:
+            special_tokens_kv['bos_token'] = '<s>'
+        if 'eos_token' in special_tokens_kv:
+            special_tokens_kv['eos_token'] = '</s>'
+ # build spm vocab
+ spm_model = spm.SentencePieceProcessor()
+ spm_model.load(model_prefix + '.model')
+ vocab = [spm_model.id_to_piece(i) for i in range(len(spm_model))]
+ os.remove(model_prefix + '.vocab')
+ elif args.model == 'subword_nmt':
+ try_import_subword_nmt()
+ from subword_nmt import learn_bpe
+ corpus_path = cat_corpus(corpus_path_list)\
+ if len(corpus_path_list) > 1 else corpus_path_list[0]
+ # build model
+ with open(corpus_path, 'r', encoding='utf-8') as fc,\
+ open(model_prefix + '.model', 'w', encoding='utf-8') as fm:
+ learn_bpe.learn_bpe(fc, fm, args.vocab_size - len(special_tokens), total_symbols=True)
+ # build vocab
+ with open(corpus_path, 'r', encoding='utf-8') as fc, \
+ open(model_prefix + '.model', 'r', encoding='utf-8') as fm:
+ vocab.extend(special_tokens)
+ uniq_chars_internal = set()
+ uniq_chars_final = set()
+ uniq_words = set()
+ for line in fc:
+ for word in line.strip('\r\n ').split(' '):
+ if word:
+ uniq_words.add(word)
+                    # the following treatment of word-final symbols is the same as in
+                    # https://github.com/rsennrich/subword-nmt/blob/master/subword_nmt/learn_bpe.py
+            uniq_words = [tuple(x[:-1]) + (x[-1] + '</w>',) for x in uniq_words]
+ for word in uniq_words:
+ for char in word[:-1]:
+ uniq_chars_internal.add(char)
+ uniq_chars_final.add(word[-1])
+ # sort to ensure the same settings produce the same vocab
+ vocab.extend(sorted(list(uniq_chars_internal)))
+ vocab.extend(sorted(list(uniq_chars_final)))
+ fm.readline()
+ pair = fm.readline()
+ while (pair):
+ vocab.append(pair.replace(' ', '', 1).strip())
+ pair = fm.readline()
+ if len(corpus_path_list) > 1:
+ os.remove(corpus_path)
+ elif args.model == 'yttm':
+ try_import_yttm()
+ import youtokentome as yttm
+ corpus_path = cat_corpus(corpus_path_list)\
+ if len(corpus_path_list) > 1 else corpus_path_list[0]
+ tokenizer = yttm.BPE.train(
+ data=corpus_path,
+ model=model_prefix + '.model',
+ vocab_size=args.vocab_size,
+ coverage=args.coverage,
+ n_threads=args.n_threads,
+ unk_id=special_tokens.index(Vocab.UNK_TOKEN),
+ bos_id=-1 if args.disable_bos else special_tokens.index(Vocab.BOS_TOKEN),
+ eos_id=-1 if args.disable_eos else special_tokens.index(Vocab.EOS_TOKEN),
+ pad_id=-1 if args.disable_pad else special_tokens.index(Vocab.PAD_TOKEN))
+ vocab = tokenizer.vocab()
+        if 'unk_token' in special_tokens_kv:
+            special_tokens_kv['unk_token'] = '<UNK>'
+        if 'bos_token' in special_tokens_kv:
+            special_tokens_kv['bos_token'] = '<BOS>'
+        if 'eos_token' in special_tokens_kv:
+            special_tokens_kv['eos_token'] = '<EOS>'
+        if 'pad_token' in special_tokens_kv:
+            special_tokens_kv['pad_token'] = '<PAD>'
+ if len(corpus_path_list) > 1:
+ os.remove(corpus_path)
+ elif args.model in ['hf_bpe', 'hf_bytebpe', 'hf_wordpiece']:
+ tokenizers = try_import_huggingface_tokenizers()
+ if args.model == 'hf_bpe':
+ tokenizer = tokenizers.CharBPETokenizer(lowercase=args.lowercase)
+ elif args.model == 'hf_bytebpe':
+ tokenizer = tokenizers.ByteLevelBPETokenizer(lowercase=args.lowercase)
+ elif args.model == 'hf_wordpiece':
+ tokenizer = tokenizers.BertWordPieceTokenizer(lowercase=args.lowercase,
+ strip_accents=args.strip_accents)
+ else:
+ raise NotImplementedError
+ tokenizer.train(
+ corpus_path_list,
+ vocab_size=args.vocab_size,
+ show_progress=True,
+ special_tokens=special_tokens)
+ tokenizer.save(args.save_dir, args.model)
+ # we replace the huggingface vocab file with our Vocab implementation
+ if args.model == 'hf_wordpiece':
+ hf_vocab_file = model_prefix + '-vocab.txt'
+ with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
+ for line in fv:
+ vocab.append(line.strip())
+ else:
+ # Move the hf_${model}-merges.txt to hf_${model}.models
+ os.rename(os.path.join(args.save_dir, '{}-merges.txt'.format(args.model)),
+ os.path.join(args.save_dir, '{}.model'.format(args.model)))
+ hf_vocab_file = model_prefix + '-vocab.json'
+ with open(hf_vocab_file, 'r', encoding='utf-8') as fv:
+ vocab_kv = json.load(fv)
+ vocab_kv = sorted(list(vocab_kv.items()), key=lambda x: x[1])
+ for kv in vocab_kv:
+ vocab.append(kv[0])
+ os.remove(hf_vocab_file)
+ else:
+ raise NotImplementedError
+ unk_token = special_tokens_kv.pop('unk_token')
+ vocab_obj = Vocab(vocab, unk_token=unk_token, **special_tokens_kv)
+ vocab_obj.save(model_prefix + '.vocab')
+
+
+def cat_corpus(corpus_path_list):
+ # TODO Use temporary file
+ corpus_path = "./" + str(uuid4()) + '.corpus'
+ with open(corpus_path, 'wb') as cat_corpus:
+ for cp in corpus_path_list:
+ with open(cp, 'rb') as corpus:
+ cat_corpus.write(corpus.read())
+ return corpus_path
+
+
+def cli_main():
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
+
+
+if __name__ == '__main__':
+ cli_main()
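
For `--model spm`, the id passed to sentencepiece for each enabled special token is simply its position in the ordered special-token list built in `main()` above. A tiny sketch of that mapping; the token strings here are placeholders, not necessarily the values of `Vocab.*_TOKEN`.

```python
from collections import OrderedDict

# With unk, bos, eos and pad all enabled, their list positions become the spm ids.
special_tokens_kv = OrderedDict(unk_token='<unk>', bos_token='<bos>',
                                eos_token='<eos>', pad_token='<pad>')
special_tokens = list(special_tokens_kv.values())

flags = ' --unk_id={} --bos_id={} --eos_id={} --pad_id={}'.format(
    special_tokens.index(special_tokens_kv['unk_token']),
    special_tokens.index(special_tokens_kv['bos_token']),
    special_tokens.index(special_tokens_kv['eos_token']),
    special_tokens.index(special_tokens_kv['pad_token']))
print(flags)   # --unk_id=0 --bos_id=1 --eos_id=2 --pad_id=3
```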
diff --git a/scripts/pretraining/README.md b/scripts/pretraining/README.md
new file mode 100644
index 0000000000..ec2c0a7ea2
--- /dev/null
+++ b/scripts/pretraining/README.md
@@ -0,0 +1,101 @@
+# Datasets
+## OpenWebTextCorpus
+Following the instructions in [Prepare OpenWebTextCorpus](../datasets/pretrain_corpus#openwebtext), download and prepare the dataset, obtaining a total of 20610 text files in the folder `prepared_owt`.
+
+```bash
+python3 data_preprocessing.py --input prepared_owt --output preprocessed_owt --max_seq_length 128 --shuffle
+```
+The above command generates the preprocessed NumPy features, which are saved as `.npz` files.
+# Pretrain Model
+## ELECTRA
+Following the [Official Quickstart](https://github.com/google-research/electra#quickstart-pre-train-a-small-electra-model), pretrain a small model using OpenWebText as the pretraining corpus. Note that [horovod](https://github.com/horovod/horovod) needs to be installed in advance if `comm_backend` is set to `horovod`.
+
+```bash
+horovodrun -np 2 -H localhost:2 python3 -m run_electra \
+ --model_name google_electra_small \
+ --data 'preprocessed_owt/*.npz' \
+ --generator_units_scale 0.25 \
+ --gpus 0,1 \
+ --do_train \
+ --do_eval \
+ --output_dir ${OUTPUT} \
+ --num_accumulated 1 \
+ --batch_size 64 \
+ --lr 5e-4 \
+ --wd 0.01 \
+ --max_seq_len 128 \
+ --max_grad_norm 1 \
+ --warmup_steps 10000 \
+ --num_train_steps 1000000 \
+ --log_interval 50 \
+ --save_interval 10000 \
+ --mask_prob 0.15 \
+ --comm_backend horovod \
+```
+
+Alternatively, we can preprocess the features on the fly and train the model directly on the raw text:
+```bash
+horovodrun -np 2 -H localhost:2 python3 -m run_electra \
+ --model_name google_electra_small \
+ --generator_units_scale 0.25 \
+ --data 'prepared_owt/*.txt' \
+ --from_raw \
+ --gpus 0,1 \
+ --do_train \
+ --do_eval \
+ --output_dir ${OUTPUT} \
+ --num_accumulated 1 \
+ --batch_size 64 \
+ --lr 5e-4 \
+ --wd 0.01 \
+ --max_seq_len 128 \
+ --max_grad_norm 1 \
+ --warmup_steps 10000 \
+ --num_train_steps 1000000 \
+ --log_interval 50 \
+ --save_interval 10000 \
+ --mask_prob 0.15 \
+ --comm_backend horovod \
+```
+
+For ease of verification, the small model pretrained on OpenWebText, named `gluon_electra_small_owt`, has been released and uploaded to S3 with the following directory structure:
+
+```
+gluon_electra_small_owt
+├── vocab-{short_hash}.json
+├── model-{short_hash}.params
+├── model-{short_hash}.yml
+├── gen_model-{short_hash}.params
+├── disc_model-{short_hash}.params
+```
+
+After pretraining, the model can be fine-tuned on downstream NLP tasks such as question answering. Here is an example of fine-tuning a locally pretrained model on [SQuAD 1.1/2.0](../question_answering#squad).
+
+```bash
+python3 run_squad.py \
+ --model_name google_electra_small \
+ --data_dir squad \
+ --backbone_path ${OUTPUT}/model-{short_hash}.params \
+    --output_dir ${FINE_TUNE_OUTPUT} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --max_saved_ckpt 6 \
+ --all_evaluate \
+ --wd 0 \
+ --max_seq_length 128 \
+ --max_grad_norm 0.1 \
+```
+
+This produces the following results:
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev |
+|--------------------------|---------------|--------------|
+|gluon_electra_small_owt | 69.40/76.98 | 67.63/69.89 |
diff --git a/scripts/pretraining/data_preprocessing.py b/scripts/pretraining/data_preprocessing.py
new file mode 100644
index 0000000000..1f75e2f782
--- /dev/null
+++ b/scripts/pretraining/data_preprocessing.py
@@ -0,0 +1,89 @@
+"""
+Prepare the feature for openwebtext dataset
+"""
+import os
+import time
+import math
+import random
+import argparse
+import multiprocessing
+
+import numpy as np
+
+from pretraining_utils import get_all_features
+from gluonnlp.models import get_backbone
+
+
+def get_parser():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument("-i", "--input", required=True,
+                        help="path to the extracted OpenWebText dataset")
+ parser.add_argument("-o", "--output", default="preprocessed_owt",
+ help="directory for preprocessed features")
+ parser.add_argument("--num_process", type=int, default=8,
+ help="number of processes for multiprocessing")
+ parser.add_argument("--max_seq_length", type=int, default=128,
+ help="the maximum length of the pretraining sequence")
+ parser.add_argument("--num_out_files", type=int, default=1000,
+ help="Number of desired output files, where each is processed"
+ " independently by a worker.")
+ parser.add_argument('--model_name', type=str, default='google_electra_small',
+ help='Name of the pretrained model.')
+ parser.add_argument("--shuffle", action="store_true",
+                        help="Whether to shuffle the data order")
+ parser.add_argument("--do_lower_case", dest='do_lower_case',
+ action="store_true", help="Lower case input text.")
+ parser.add_argument("--no_lower_case", dest='do_lower_case',
+ action='store_false', help="Don't lower case input text.")
+ parser.add_argument("--short_seq_prob", type=float, default=0.05,
+ help="The probability of sampling sequences shorter than"
+ " the max_seq_length.")
+ parser.set_defaults(do_lower_case=True)
+ return parser
+
+
+def main(args):
+ num_process = min(multiprocessing.cpu_count(), args.num_process)
+ _, cfg, tokenizer, _, _ = \
+ get_backbone(args.model_name, load_backbone=False)
+
+ fnames = sorted(os.listdir(args.input))
+ fnames = [os.path.join(args.input, fname) for fname in fnames]
+ if args.shuffle:
+ random.shuffle(fnames)
+ num_files = len(fnames)
+ num_out_files = min(args.num_out_files, num_files)
+    split_files = np.array_split(fnames, num_out_files)
+ output_files = [os.path.join(
+ args.output, "owt-pretrain-record-{}.npz".format(str(i).zfill(4))) for i in range(num_out_files)]
+ print("All preprocessed features will be saved in {} npz files".format(num_out_files))
+    os.makedirs(args.output, exist_ok=True)
+ num_process = min(num_process, num_out_files)
+ print('Start preprocessing {} text files with {} cores'.format(
+ num_files, num_process))
+    process_args = [
+        (split_files[i],
+         output_files[i],
+         tokenizer,
+         args.max_seq_length,
+         args.short_seq_prob) for i in range(num_out_files)]
+ start_time = time.time()
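+    # each worker processes one group of input files and writes a single .npz output file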
+    with multiprocessing.Pool(num_process) as pool:
+        feature_iter = pool.imap(get_all_features, process_args)
+        fea_written = 0
+        f_read = 0
+        for i, np_features in enumerate(feature_iter):
+            elapsed = time.time() - start_time
+            fea_written += len(np_features[0])
+            f_read += len(split_files[i])
+            print("Processed {} features from {} files, Elapsed: {:.2f}s, ETA: {:.2f}s".format(
+                fea_written, f_read, elapsed, (num_files - f_read) / (f_read / elapsed)))
+ print("Done processing within {:.2f} seconds".format(elapsed))
+
+
+if __name__ == '__main__':
+ parser = get_parser()
+ args = parser.parse_args()
+ main(args)
diff --git a/scripts/pretraining/pretraining_utils.py b/scripts/pretraining/pretraining_utils.py
new file mode 100644
index 0000000000..cc84641589
--- /dev/null
+++ b/scripts/pretraining/pretraining_utils.py
@@ -0,0 +1,554 @@
+"""Utilities for pre-training."""
+import io
+import os
+import re
+import random
+import logging
+import collections
+
+import numpy as np
+from mxnet.gluon import HybridBlock
+from mxnet.gluon.data import ArrayDataset
+
+import gluonnlp.data.batchify as bf
+from gluonnlp.utils.misc import glob
+from gluonnlp.data.loading import NumpyDataset, DatasetLoader
+from gluonnlp.data.sampler import SplitSampler, FixedBucketSampler
+from gluonnlp.op import select_vectors_by_position, update_vectors_by_position
+
+PretrainFeature = collections.namedtuple(
+ 'PretrainFeature',
+ ['input_id',
+ 'segment_id',
+ 'valid_length'])
+
+
+def tokenize_lines_to_ids(lines, tokenizer):
+ """
+ Worker function to tokenize lines based on the tokenizer, and perform vocabulary lookup.
+
+ Parameters
+ ----------
+ lines
+ Lines to be tokenized of the whole file
+ tokenizer
+ The trained tokenizer
+
+ Returns
+ -------
+ results
+ A list storing the valid tokenized lines
+ """
+ results = []
+ # tag line delimiters or doc delimiters
+ for line in lines:
+ if not line:
+ break
+ line = line.strip()
+ # Single empty lines are used as line delimiters
+ # Double empty lines are used as document delimiters
+ if not line:
+ results.append([])
+ else:
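+            # tokenize the line and map tokens to vocabulary ids in a single call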
+ token_ids = tokenizer.encode(line, int)
+ if token_ids:
+ results.append(token_ids)
+ return results
+
+
+def get_all_features(x):
+ """
+ Get the feature data in numpy form.
+
+ Parameters
+ ----------
+ x
+ List/tuple that contains:
+
+ - file_list
+ A list of text files
+ - output_file
+            The path to an output file that stores the np_features
+ - tokenizer
+ The trained tokenizer
+ - max_seq_length
+ Maximum sequence length of the training features
+ - short_seq_prob
+ The probability of sampling sequences shorter than the max_seq_length.
+
+ Returns
+ -------
+ np_features
+ A tuple of (input_ids, segment_ids, valid_lengths),
+ in which each item is a list of numpy arrays.
+ """
+ file_list, output_file, tokenizer, max_seq_length, short_seq_prob = x
+ all_features = []
+ for text_file in file_list:
+ features = process_a_text(text_file, tokenizer, max_seq_length, short_seq_prob)
+ all_features.extend(features)
+ np_features = convert_to_npz(all_features, output_file)
+ return np_features
+
+
+def process_a_text(text_file, tokenizer, max_seq_length, short_seq_prob=0.05):
+ """
+ Create features from a single raw text file, in which one line is treated
+ as a sentence, and double blank lines represent document separators.
+
+    In this process, framework-agnostic features are generated so that they can easily be
+    converted to features of a particular deep learning framework in subsequent steps.
+
+ Parameters
+ ----------
+ text_file
+ The path to a single text file
+ tokenizer
+ The trained tokenizer
+ max_seq_length
+ Maximum sequence length of the training features
+ short_seq_prob
+ The probability of sampling sequences shorter than the max_seq_length.
+
+ Returns
+ -------
+ features
+ A list of processed features from a single text file
+ """
+ vocab = tokenizer.vocab
+ features = []
+ # TODO(zheyuye), support whole word masking
+ with io.open(text_file, 'r', encoding='utf-8') as reader:
+ lines = reader.readlines()
+ tokenized_lines = tokenize_lines_to_ids(lines, tokenizer)
+ target_seq_length = max_seq_length
+ current_sentences = []
+ current_length = 0
+ for tokenized_line in tokenized_lines:
+ current_sentences.append(tokenized_line)
+ current_length += len(tokenized_line)
+            # Create a feature when an empty line is encountered or the target length is reached
+ if (not tokenized_line and current_length != 0) or (
+ current_length >= target_seq_length):
+ first_segment, second_segment = \
+ sentenceize(current_sentences, max_seq_length, target_seq_length)
+
+ input_id = [vocab.cls_id] + first_segment + [vocab.sep_id]
+ segment_id = [0] * len(input_id)
+
+ if second_segment:
+ input_id += second_segment + [vocab.sep_id]
+ segment_id += [1] * (len(second_segment) + 1)
+
+ # Padding with zeros for parallel storage
+ valid_length = len(input_id)
+ input_id += [0] * (max_seq_length - len(input_id))
+ segment_id += [0] * (max_seq_length - len(segment_id))
+
+ feature = PretrainFeature(input_id=input_id,
+ segment_id=segment_id,
+ valid_length=valid_length)
+ features.append(feature)
+
+ current_sentences = []
+ current_length = 0
+            # small chance to generate a random-length feature instead of a full max_seq_length one
+ if random.random() < short_seq_prob:
+ target_seq_length = random.randint(5, max_seq_length)
+ else:
+ target_seq_length = max_seq_length
+
+ return features
+
+
+def convert_to_npz(all_features, output_file=None):
+ """
+    Convert features to numpy arrays and store them if output_file is provided
+
+ Parameters
+ ----------
+ all_features
+ A list of processed features.
+ output_file
+        The path to an output file that stores the np_features.
+
+    Returns
+    -------
+    input_ids
+        A list of input token ids as numpy arrays
+ segment_ids
+ The segment ids
+ valid_lengths
+ The valid lengths
+ """
+ input_ids = []
+ segment_ids = []
+ valid_lengths = []
+ for fea_index, feature in enumerate(all_features):
+ input_ids.append(np.ascontiguousarray(feature.input_id, dtype='int32'))
+ segment_ids.append(np.ascontiguousarray(feature.segment_id, dtype='int32'))
+ valid_lengths.append(feature.valid_length)
+ if fea_index < 1:
+ logging.debug('*** Example Feature ***')
+ logging.debug('Generated {}'.format(feature))
+
+ if output_file:
+        # The numpy arrays are fixed to length max_seq_length with zero padding
+ npz_outputs = collections.OrderedDict()
+ npz_outputs['input_ids'] = np.array(input_ids, dtype='int32')
+ npz_outputs['segment_ids'] = np.array(segment_ids, dtype='int32')
+ npz_outputs['valid_lengths'] = np.array(valid_lengths, dtype='int32')
+ np.savez_compressed(output_file, **npz_outputs)
+ logging.info("Saved {} features in {} ".format(len(all_features), output_file))
+ return input_ids, segment_ids, valid_lengths
+
+
+def sentenceize(current_sentences, max_seq_length, target_seq_length):
+ """
+    Generate a pair of sentences based on a segmentation strategy
+    cloned from the official ELECTRA implementation.
+
+ Parameters
+ ----------
+    current_sentences
+        The list of tokenized sentences accumulated so far
+ max_seq_length
+ Maximum sequence length of the training features
+ target_seq_length
+ Target sequence length of the training features
+ Returns
+ -------
+ first_segment
+ The first sentence of the pretraining sequence
+ second_segment
+ The second sentence of the pretraining sequence.
+ Could be None for diversity of training instances.
+ """
+ # 10% chance to only produce one segment
+ if random.random() < 0.1:
+ first_segment_target_length = 100000
+ else:
+        # Reserve space for the [CLS] and two [SEP] tokens; the remainder is split between segments
+ first_segment_target_length = (target_seq_length - 3) // 2
+ first_segment = []
+ second_segment = []
+ for sentence in current_sentences:
+ if sentence:
+ # the sentence goes to the first segment if (1) the first segment is
+ # empty, (2) the sentence doesn't put the first segment over length or
+ # (3) 50% of the time when it does put the first segment over length
+ if (len(first_segment) == 0 or
+ len(first_segment) + len(sentence) < first_segment_target_length or
+ (len(second_segment) == 0 and
+ len(first_segment) < first_segment_target_length and
+ random.random() < 0.5)):
+ first_segment += sentence
+ else:
+ second_segment += sentence
+
+ # trim to max_length while accounting for not-yet-added [CLS]/[SEP] tokens
+ first_segment = first_segment[:max_seq_length - 2]
+ second_segment = second_segment[:max(0, max_seq_length -
+ len(first_segment) - 3)]
+
+ return first_segment, second_segment
+
+
+def prepare_pretrain_npz_dataset(filename, allow_pickle=False):
+ """Create dataset based on the numpy npz file"""
+ if isinstance(filename, (list, tuple)):
+ assert len(filename) == 1, \
+ 'When .npy/.npz data file is loaded, len(filename) must be 1.' \
+ ' Received len(filename)={}.'.format(len(filename))
+ filename = filename[0]
+ logging.debug('start to load file %s ...', filename)
+ return NumpyDataset(filename, allow_pickle=allow_pickle)
+
+
+def prepare_pretrain_text_dataset(
+ filenames,
+ tokenizer,
+ max_seq_length,
+ short_seq_prob,
+ cached_file_path):
+ """Create dataset based on the raw text files"""
+ if not isinstance(filenames, (list, tuple)):
+ filenames = [filenames]
+ if cached_file_path:
+        # derive an output filename from the input filename so that cached files do not collide.
+ # filename example: urlsf_subset00-130_data.txt
+ suffix = re.split(r'\.|/', filenames[0])[-2]
+ output_file = os.path.join(cached_file_path, "{}-pretrain-record.npz".format(suffix))
+ else:
+ output_file = None
+ np_features = get_all_features(
+ (filenames, output_file, tokenizer, max_seq_length, short_seq_prob))
+
+ return ArrayDataset(*np_features)
+
+
+def prepare_pretrain_bucket_sampler(dataset, batch_size, shuffle=False, num_buckets=1):
+ """Create data sampler based on the dataset"""
+ if isinstance(dataset, NumpyDataset):
+ lengths = dataset.get_field('valid_lengths')
+ else:
+ lengths = dataset.transform(lambda input_ids, segment_ids,
+ valid_lengths: valid_lengths, lazy=False)
+ sampler = FixedBucketSampler(lengths,
+ batch_size=batch_size,
+ num_buckets=num_buckets,
+ ratio=0,
+ shuffle=shuffle)
+ logging.debug('Sampler created for a new dataset:\n {}'.format(sampler))
+ return sampler
+
+
+def get_pretrain_data_npz(data, batch_size, shuffle, num_buckets,
+ vocab, num_parts=1, part_idx=0,
+ num_dataset_workers=1, num_batch_workers=1,
+ circle_length=1, repeat=1,
+ dataset_cached=False,
+ num_max_dataset_cached=0):
+ """Get a data iterator from pre-processed npz files.
+
+ Parameters
+ ----------
+ data: str
+ The path to the dataset directory
+ batch_size : int
+ The batch size per GPU.
+ shuffle : bool
+ Whether to shuffle the data.
+ num_buckets : int
+ The number of buckets for the FixedBucketSampler for training.
+ vocab : Vocab
+ The vocabulary.
+ num_parts : int
+ The number of partitions for the dataset.
+ part_idx : int
+ The index of the partition to read.
+ num_dataset_workers : int
+ The number of worker processes for dataset construction.
+ num_batch_workers : int
+        The number of worker processes for batch construction.
+ circle_length : int, default is 1
+ The number of files to be read for a single worker at the same time.
+ When circle_length is larger than 1, we merge circle_length files.
+ repeat : int, default is 1
+ The number of times that files are repeated.
+ dataset_cached : bool, default is False
+ Whether or not to cache last processed dataset.
+ Each processed dataset can only be cached for once.
+ When there is no new available processed dataset to be fetched,
+ we pop a cached processed dataset.
+ num_max_dataset_cached : int, default is 0
+ Maximum number of cached datasets. It is valid only if dataset_cached is True
+ """
+ num_files = len(glob(data))
+ logging.info('%d files are found.', num_files)
+ assert num_files >= num_parts, \
+ 'The number of text files must be no less than the number of ' \
+ 'workers/partitions (%d). Only %d files at %s are found.' % (num_parts, num_files, data)
+ split_sampler = SplitSampler(num_files, num_parts=num_parts,
+ part_index=part_idx, repeat=repeat)
+ dataset_fn = prepare_pretrain_npz_dataset
+ sampler_fn = prepare_pretrain_bucket_sampler
+ dataset_params = {'allow_pickle': True}
+ sampler_params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_buckets': num_buckets}
+ batchify_fn = bf.Tuple(
+ bf.Pad(val=vocab.pad_id), # input_ids
+ bf.Pad(val=0), # segment_ids
+ bf.Stack(), # valid_lengths
+ )
+ dataloader = DatasetLoader(data,
+ file_sampler=split_sampler,
+ dataset_fn=dataset_fn,
+ batch_sampler_fn=sampler_fn,
+ dataset_params=dataset_params,
+ batch_sampler_params=sampler_params,
+ batchify_fn=batchify_fn,
+ num_dataset_workers=num_dataset_workers,
+ num_batch_workers=num_batch_workers,
+ pin_memory=False,
+ circle_length=circle_length)
+ return dataloader
+
+
+def get_pretrain_data_text(data, batch_size, shuffle, num_buckets, tokenizer, vocab,
+ max_seq_length, short_seq_prob=0.05, num_parts=1,
+ part_idx=0, num_dataset_workers=1, num_batch_workers=1,
+ circle_length=1, repeat=1, cached_file_path=None):
+ """Get a data iterator from raw text documents.
+
+ Parameters
+ ----------
+ batch_size : int
+ The batch size per GPU.
+ shuffle : bool
+ Whether to shuffle the data.
+ num_buckets : int
+ The number of buckets for the FixedBucketSampler for training.
+ vocab : Vocab
+ The vocabulary.
+ tokenizer : HuggingFaceWordPieceTokenizer or SentencepieceTokenizer
+ The tokenizer.
+ max_seq_length : int
+ The hard limit of maximum sequence length of sentence pairs.
+ short_seq_prob : float
+ The probability of sampling sequences shorter than the max_seq_length.
+ num_parts : int
+ The number of partitions for the dataset.
+ part_idx : int
+ The index of the partition to read.
+ num_dataset_workers : int
+ The number of worker processes for dataset construction.
+ num_batch_workers : int
+ The number of worker processes for batch construction.
+ circle_length : int, default is 1
+ The number of files to be read for a single worker at the same time.
+ When circle_length is larger than 1, we merge circle_length files.
+ repeat : int, default is 1
+ The number of times that files are repeated.
+ cached_file_path: str, default is None
+ Directory for saving preprocessed features
+ """
+ num_files = len(glob(data))
+ logging.info('%d files are found.', num_files)
+ assert num_files >= num_parts, \
+ 'The number of text files must be no less than the number of ' \
+ 'workers/partitions (%d). Only %d files at %s are found.' % (num_parts, num_files, data)
+ split_sampler = SplitSampler(num_files, num_parts=num_parts,
+ part_index=part_idx, repeat=repeat)
+ dataset_fn = prepare_pretrain_text_dataset
+ sampler_fn = prepare_pretrain_bucket_sampler
+ dataset_params = {'tokenizer': tokenizer, 'max_seq_length': max_seq_length,
+ 'short_seq_prob': short_seq_prob, 'cached_file_path': cached_file_path}
+ sampler_params = {'batch_size': batch_size, 'shuffle': shuffle, 'num_buckets': num_buckets}
+ batchify_fn = bf.Tuple(
+ bf.Pad(val=vocab.pad_id), # input_ids
+ bf.Pad(val=0), # segment_ids
+ bf.Stack(), # valid_lengths
+ )
+
+ dataloader = DatasetLoader(data,
+ file_sampler=split_sampler,
+ dataset_fn=dataset_fn,
+ batch_sampler_fn=sampler_fn,
+ dataset_params=dataset_params,
+ batch_sampler_params=sampler_params,
+ batchify_fn=batchify_fn,
+ num_dataset_workers=num_dataset_workers,
+ num_batch_workers=num_batch_workers,
+ pin_memory=False,
+ circle_length=circle_length)
+ return dataloader
+
+
+class ElectraMasker(HybridBlock):
+    """Process the pre-processed pretraining data by dynamically masking tokens."""
+ MaskedInput = collections.namedtuple('MaskedInput',
+ ['input_ids',
+ 'masks',
+ 'unmasked_tokens',
+ 'masked_positions',
+ 'masked_weights'])
+
+ def __init__(self, tokenizer, max_seq_length, mask_prob,
+ proposal_distribution=1.0):
+ super().__init__()
+ self._max_seq_length = max_seq_length
+ self._mask_prob = mask_prob
+ self._max_num_masked_position = int((self._mask_prob + 0.005) *
+ self._max_seq_length)
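+        # e.g. mask_prob=0.15 and max_seq_length=128 give at most int(0.155 * 128) = 19 masked positions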
+ self._proposal_distribution = proposal_distribution
+ self.vocab = tokenizer.vocab
+
+ def dynamic_masking(self, F, input_ids, valid_lengths):
+ # TODO(zheyuye), two additional flag `disallow_from_mask` and `already_masked`
+ # that control the masking status for each positions in the sequence.
+        """
+        Generate masking positions on-the-fly instead of during preprocessing.
+
+        Parameters
+        ----------
+        input_ids
+            The batchified input_ids with shape (batch_size, max_seq_length)
+        valid_lengths
+            The batchified valid_lengths with shape (batch_size, )
+
+        Returns
+        -------
+        masked_input_ids
+            The input sequence in which the sampled positions are replaced by the [MASK] token.
+            shape (batch_size, max_seq_length)
+        length_masks
+            The mask over the whole sequence marking with 1 the positions that fall within
+            valid_length.
+            shape (batch_size, max_seq_length)
+        unmasked_tokens
+            The original tokens at the masked positions.
+            shape (batch_size, num_masked_positions)
+        masked_positions
+            The indices of the masked positions.
+            shape (batch_size, num_masked_positions)
+        masked_weights
+            The weight matrix containing 0 or 1 to mark which masked positions are in effect.
+            shape (batch_size, num_masked_positions)
+        """
+ N = self._max_num_masked_position
+        # Only valid tokens, excluding the special tokens, are allowed to be masked
+ valid_candidates = F.np.ones_like(input_ids, dtype=np.bool)
+ ignore_tokens = [self.vocab.cls_id, self.vocab.sep_id, self.vocab.pad_id]
+
+ for ignore_token in ignore_tokens:
+ # TODO(zheyuye), Update when operation += supported
+ valid_candidates = valid_candidates * \
+ F.np.not_equal(input_ids, ignore_token)
+ valid_lengths = valid_lengths.astype(np.float32)
+ valid_candidates = valid_candidates.astype(np.float32)
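+        # mask at least one position per sequence, but never more than the cap N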
+ num_masked_position = F.np.maximum(
+ 1, F.np.minimum(N, round(valid_lengths * self._mask_prob)))
+
+ # Get the masking probability of each position
+ sample_probs = self._proposal_distribution * valid_candidates
+ sample_probs /= F.np.sum(sample_probs, axis=-1, keepdims=True)
+ sample_probs = F.npx.stop_gradient(sample_probs)
+ gumbels = F.np.random.gumbel(F.np.zeros_like(sample_probs))
+        # Following the instructions of the official repo, use top-k sampling to avoid
+        # duplicate positions, as in https://github.com/google-research/electra/issues/41
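+        # Adding i.i.d. Gumbel noise to the log-probabilities and taking the top-k indices
+        # draws k distinct positions in proportion to sample_probs (the Gumbel-top-k trick)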
+ masked_positions = F.npx.topk(
+ F.np.log(sample_probs) + gumbels, k=N,
+ axis=-1, ret_typ='indices', dtype=np.int32)
+
+ masked_weights = F.npx.sequence_mask(
+ F.np.ones_like(masked_positions),
+ sequence_length=num_masked_position,
+ use_sequence_length=True, axis=1, value=0)
+ masked_positions = masked_positions * masked_weights
+ length_masks = F.npx.sequence_mask(
+ F.np.ones_like(input_ids, dtype=np.float32),
+ sequence_length=valid_lengths,
+ use_sequence_length=True, axis=1, value=0)
+ unmasked_tokens = select_vectors_by_position(
+ F, input_ids, masked_positions) * masked_weights
+ masked_weights = masked_weights.astype(np.float32)
+ replaced_positions = (
+ F.np.random.uniform(
+ F.np.zeros_like(masked_positions),
+ F.np.ones_like(masked_positions)) > self._mask_prob) * masked_positions
+        # deal with multiple zero values in replaced_positions, which would cause
+        # the [CLS] token to be replaced
+ filled = F.np.where(
+ replaced_positions,
+ self.vocab.mask_id,
+ self.vocab.cls_id).astype(
+ np.int32)
+        # Mask tokens by replacing them with [MASK]
+ masked_input_ids = update_vectors_by_position(F, input_ids, filled, replaced_positions)
+
+        # Note: masked_positions is likely to contain multiple zero values if the number of
+        # masked positions has not reached the maximum. However, this case rarely occurs since
+        # valid_length is almost always equal to max_seq_length
+ masked_input = self.MaskedInput(input_ids=masked_input_ids,
+ masks=length_masks,
+ unmasked_tokens=unmasked_tokens,
+ masked_positions=masked_positions,
+ masked_weights=masked_weights)
+ return masked_input
diff --git a/scripts/pretraining/run_electra.py b/scripts/pretraining/run_electra.py
new file mode 100644
index 0000000000..1678eeae8d
--- /dev/null
+++ b/scripts/pretraining/run_electra.py
@@ -0,0 +1,554 @@
+"""Pretraining Example for Electra Model on the OpenWebText dataset"""
+
+import os
+import time
+import shutil
+import logging
+import argparse
+import functools
+import collections
+
+import mxnet as mx
+import numpy as np
+from mxnet.lr_scheduler import PolyScheduler
+
+from sklearn import metrics
+from pretraining_utils import ElectraMasker, get_pretrain_data_npz, get_pretrain_data_text
+from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, logging_config, naming_convention
+from gluonnlp.initializer import TruncNorm
+from gluonnlp.models.electra import ElectraModel, ElectraForPretrain, get_pretrained_electra
+from gluonnlp.utils.parameter import clip_grad_global_norm
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ pass
+
+mx.npx.set_np()
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(description=__doc__)
+ parser.add_argument('--model_name', type=str, default='google_electra_small',
+ help='Name of the pretrained model.')
+ parser.add_argument('--do_train', action='store_true',
+ help='Whether to train the model')
+ parser.add_argument('--do_eval', action='store_true',
+ help='Whether to evaluate the model')
+ parser.add_argument('--data', type=str, default=None,
+ help='Path to pretraining corpus file. File name with wildcard such as'
+ ' dir/*.npz is accepted. Or file name with wildcard such as dir/*.txt if'
+ ' --from_raw_text is set.')
+ parser.add_argument('--output_dir', type=str, default='electra_owt',
+ help='The output directory where the model params will be written.'
+                             ' Default is electra_owt.')
+ # Training hyperparameters
+ parser.add_argument('--seed', type=int, default=100, help='Random seed')
+ parser.add_argument('--log_interval', type=int,
+ default=100, help='The logging interval.')
+ parser.add_argument('--save_interval', type=int, default=1000,
+                        help='The number of steps between saving model parameters.'
+                             ' Default is 1000.')
+ # Data Loading from npz, need to be same as pretraining example
+ parser.add_argument('--max_seq_length', type=int, default=128,
+ help='The maximum total input sequence length after tokenization.'
+                             ' Sequences longer than this will be truncated, and sequences shorter '
+ 'than this will be padded. default is 128')
+ parser.add_argument("--do_lower_case", dest='do_lower_case',
+ action="store_true", help="Lower case input text. Default is True")
+ parser.add_argument("--no_lower_case", dest='do_lower_case',
+ action='store_false', help="Don't lower case input text.")
+ parser.add_argument('--mask_prob', type=float, default=0.15,
+ help='mask probability for generator input')
+ parser.set_defaults(do_lower_case=True)
+ parser.add_argument('--num_dataset_workers', type=int, default=4,
+ help='Number of workers to pre-process dataset.')
+ parser.add_argument('--num_batch_workers', type=int, default=2,
+ help='Number of workers to pre-process mini-batch.')
+ parser.add_argument('--num_buckets', type=int, default=1,
+ help='Number of buckets for variable length sequence sampling')
+ # Data pre-processing from raw text. the below flags are only valid if --from_raw_text is set
+ parser.add_argument('--from_raw_text', action='store_true',
+ help='If set, both training and dev samples are generated on-the-fly '
+ 'from raw texts instead of pre-processed npz files. ')
+ parser.add_argument("--short_seq_prob", type=float, default=0.05,
+ help='The probability of sampling sequences '
+ 'shorter than the max_seq_length.')
+ parser.add_argument("--cached_file_path", default=None,
+ help='Directory for saving preprocessed features')
+ parser.add_argument('--circle_length', type=int, default=2,
+ help='Number of files to be read for a single GPU at the same time.')
+ parser.add_argument('--repeat', type=int, default=8,
+ help='Number of times that files are repeated in each shuffle.')
+ # Optimization
+ parser.add_argument('--num_train_steps', type=int, default=1000000,
+ help='The number of training steps. Note that epochs will be ignored '
+ 'if training steps are set')
+ parser.add_argument('--warmup_steps', type=int, default=10000,
+ help='warmup steps. Note that either warmup_steps or warmup_ratio is set.')
+ parser.add_argument('--warmup_ratio', type=float, default=0.1,
+ help='Ratio of warmup steps in the learning rate scheduler.')
+ parser.add_argument('--batch_size', type=int, default=8,
+ help='Batch size. Number of examples per gpu in a minibatch. default is 8')
+ parser.add_argument('--max_grad_norm', type=float, default=1.0,
+ help='Max gradient norm.')
+ parser.add_argument('--optimizer', type=str, default='adamw',
+ help='optimization algorithm. default is adamw')
+ parser.add_argument('--lr_decay_power', type=float, default=1.0,
+ help="Decay power for layer-wise learning rate")
+ parser.add_argument('--num_accumulated', type=int, default=1,
+ help='The number of batches for gradients accumulation to '
+ 'simulate large batch size.')
+ parser.add_argument('--lr', type=float, default=5e-4,
+ help='Initial learning rate. default is 5e-4')
+ parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--start_step', type=int, default=0,
+ help='Start optimization step from the checkpoint.')
+    # Model Configuration
+    parser.add_argument('--disc_weight', type=float, default=50.0,
+                        help='loss weight for the discriminator')
+    parser.add_argument('--gen_weight', type=float, default=1.0,
+                        help='loss weight for the generator')
+ parser.add_argument('--hidden_dropout_prob', type=float, default=0.1,
+ help='dropout of hidden layer')
+ parser.add_argument('--attention_dropout_prob', type=float, default=0.1,
+ help='dropout of attention layer')
+ parser.add_argument('--generator_units_scale', type=float, default=None,
+ help='The scale size of the generator units')
+ parser.add_argument('--generator_layers_scale', type=float, default=None,
+ help='The scale size of the generator layer')
+ # Communication
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str, default='0',
+ help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.')
+ args = parser.parse_args()
+ return args
+
+
+def get_pretraining_model(model_name, ctx_l,
+ max_seq_length=128,
+ hidden_dropout_prob=0.1,
+ attention_dropout_prob=0.1,
+ generator_units_scale=None,
+ generator_layers_scale=None):
+ """
+    An ELECTRA pretraining model is built with a generator and a discriminator, in which
+    the generator shares its embeddings with the discriminator but uses a different backbone.
+ """
+ cfg, tokenizer, _, _ = get_pretrained_electra(
+ model_name, load_backbone=False)
+ cfg = ElectraModel.get_cfg().clone_merge(cfg)
+ cfg.defrost()
+ cfg.MODEL.hidden_dropout_prob = hidden_dropout_prob
+ cfg.MODEL.attention_dropout_prob = attention_dropout_prob
+ cfg.MODEL.max_length = max_seq_length
+ # Keep the original generator size if not designated
+ if generator_layers_scale:
+ cfg.MODEL.generator_layers_scale = generator_layers_scale
+ if generator_units_scale:
+ cfg.MODEL.generator_units_scale = generator_units_scale
+ cfg.freeze()
+
+ model = ElectraForPretrain(cfg,
+ uniform_generator=False,
+ tied_generator=False,
+ tied_embeddings=True,
+ disallow_correct=False,
+ weight_initializer=TruncNorm(stdev=0.02))
+ model.initialize(ctx=ctx_l)
+ model.hybridize()
+ return cfg, tokenizer, model
+
+
+ElectraOutput = collections.namedtuple('ElectraOutput',
+ ['mlm_scores',
+ 'rtd_scores',
+ 'rtd_labels',
+ 'corrupted_tokens'])
+
+
+def final_save(model, save_dir, tokenizer):
+ if not os.path.exists(save_dir):
+ os.makedirs(save_dir)
+
+ with open(os.path.join(save_dir, 'model.yml'), 'w') as of:
+ of.write(model.disc_cfg.dump())
+ tokenizer.vocab.save(os.path.join(save_dir, 'vocab.json'))
+ model.disc_backbone.save_parameters(os.path.join(save_dir, 'model.params'))
+ model.discriminator.save_parameters(os.path.join(save_dir, 'disc_model.params'))
+ model.generator.save_parameters(os.path.join(save_dir, 'gen_model.params'))
+
+ logging.info('Statistics:')
+
+ old_names = os.listdir(save_dir)
+ for old_name in old_names:
+ new_name, long_hash = naming_convention(save_dir, old_name)
+ old_path = os.path.join(save_dir, old_name)
+ new_path = os.path.join(save_dir, new_name)
+ shutil.move(old_path, new_path)
+ file_size = os.path.getsize(new_path)
+ logging.info('\t{}/{} {} {}'.format(save_dir, new_name, long_hash, file_size))
+
+
+def parameters_option(step_num, model, ckpt_dir, option='Saving'):
+ """Save or load the model parameter, marked by step_num."""
+ param_path = os.path.join(
+ ckpt_dir, '{}.params'.format(str(step_num).zfill(7)))
+ logging.info('[step {}], {} model params to/from {}.'.format(
+ step_num, option, param_path))
+ if option == 'Saving':
+ model.save_parameters(param_path)
+ return param_path
+ elif option == 'Loading':
+ model.load_parameters(param_path)
+ return model
+ else:
+ raise NotImplementedError('Unknown Option: {}'.format(option))
+
+
+def states_option(step_num, trainer, ckpt_dir, local_rank=0, option='Saving'):
+ """Save or load the trainer states, marked by step_num and local rank."""
+ state_path = os.path.join(ckpt_dir, '{}.states.{}'.format(
+ str(step_num).zfill(7), str(local_rank).zfill(2)))
+ logging.info('[step {}], {} trainer states to/from {}.'.format(
+ step_num, option, state_path))
+ if option == 'Saving':
+ trainer.save_states(state_path)
+ return state_path
+ elif option == 'Loading':
+ trainer.load_states(state_path)
+ return trainer
+ else:
+ raise NotImplementedError('Unknown Option: {}'.format(option))
+
+
+def train(args):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ logging_config(args.output_dir,
+ name='pretrain_owt_' + str(rank), # avoid race
+ console=(local_rank == 0))
+ logging.info(args)
+ logging.debug('Random seed set to {}'.format(args.seed))
+ set_seed(args.seed)
+ logging.info('Training info: num_buckets: {}, '
+ 'num_workers: {}, rank: {}'.format(
+ args.num_buckets, num_workers, rank))
+ cfg, tokenizer, model = get_pretraining_model(args.model_name, ctx_l,
+ args.max_seq_length,
+ args.hidden_dropout_prob,
+ args.attention_dropout_prob,
+ args.generator_units_scale,
+ args.generator_layers_scale)
+ data_masker = ElectraMasker(
+ tokenizer, args.max_seq_length, args.mask_prob)
+ if args.from_raw_text:
+ if args.cached_file_path and not os.path.exists(args.cached_file_path):
+ os.mkdir(args.cached_file_path)
+ get_dataset_fn = functools.partial(get_pretrain_data_text,
+ max_seq_length=args.max_seq_length,
+ short_seq_prob=args.short_seq_prob,
+ tokenizer=tokenizer,
+ circle_length=args.circle_length,
+ repeat=args.repeat,
+ cached_file_path=args.cached_file_path)
+
+ logging.info('Processing and loading the training dataset from raw text.')
+
+ else:
+ logging.info('Loading the training dataset from local Numpy file.')
+ get_dataset_fn = get_pretrain_data_npz
+
+ data_train = get_dataset_fn(args.data, args.batch_size, shuffle=True,
+ num_buckets=args.num_buckets, vocab=tokenizer.vocab,
+ num_parts=num_workers, part_idx=rank,
+ num_dataset_workers=args.num_dataset_workers,
+ num_batch_workers=args.num_batch_workers)
+
+ logging.info('Creating distributed trainer...')
+ param_dict = model.collect_params()
+    # Do not apply weight decay to any LayerNorm parameters or biases
+ for _, v in model.collect_params('.*beta|.*gamma|.*bias').items():
+ v.wd_mult = 0.0
+ # Collect differentiable parameters
+ params = [p for p in param_dict.values() if p.grad_req != 'null']
+ # Set grad_req if gradient accumulation is required
+ num_accumulated = args.num_accumulated
+ if num_accumulated > 1:
+ logging.info('Using gradient accumulation. Effective global batch size = {}'
+ .format(num_accumulated * args.batch_size * len(ctx_l) * num_workers))
+ for p in params:
+ p.grad_req = 'add'
+ # backend specific implementation
+ if args.comm_backend == 'horovod':
+ # Horovod: fetch and broadcast parameters
+ hvd.broadcast_parameters(param_dict, root_rank=0)
+
+ num_train_steps = args.num_train_steps
+ if args.warmup_steps is not None:
+ warmup_steps = args.warmup_steps
+ else:
+ warmup_steps = int(num_train_steps * args.warmup_ratio)
+ assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio'
+ log_interval = args.log_interval
+ save_interval = args.save_interval if args.save_interval is not None\
+ else num_train_steps // 50
+ logging.info('#Total Training Steps={}, Warmup={}, Save Interval={}'
+ .format(num_train_steps, warmup_steps, save_interval))
+
+ lr_scheduler = PolyScheduler(max_update=num_train_steps,
+ base_lr=args.lr,
+ warmup_begin_lr=0,
+ pwr=1,
+ final_lr=0,
+ warmup_steps=warmup_steps,
+ warmup_mode='linear')
+ optimizer_params = {'learning_rate': args.lr,
+ 'wd': args.wd,
+ 'lr_scheduler': lr_scheduler,
+ }
+ if args.optimizer == 'adamw':
+ optimizer_params.update({'beta1': 0.9,
+ 'beta2': 0.999,
+ 'epsilon': 1e-6,
+ 'correct_bias': False,
+ })
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params)
+ else:
+ trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params,
+ update_on_kvstore=False)
+ if args.start_step:
+ logging.info('Restart training from {}'.format(args.start_step))
+ # TODO(zheyuye), How about data splitting, where to start re-training
+ state_path = states_option(
+ args.start_step, trainer, args.output_dir, local_rank, 'Loading')
+ param_path = parameters_option(
+ args.start_step, model, args.output_dir, 'Loading')
+
+ # prepare the loss function
+ mlm_loss_fn = mx.gluon.loss.SoftmaxCELoss()
+ rtd_loss_fn = mx.gluon.loss.SigmoidBinaryCrossEntropyLoss()
+ mlm_loss_fn.hybridize()
+ rtd_loss_fn.hybridize()
+
+ # prepare the records writer
+ writer = None
+ # only one process on each worker will write the tensorboardX's records to avoid race
+ if args.do_eval and local_rank == 0:
+ from tensorboardX import SummaryWriter
+ record_path = os.path.join(args.output_dir, 'records')
+ logging.info('Evaluation records saved in {}'.format(record_path))
+ writer = SummaryWriter(record_path)
+
+ step_num = args.start_step
+ finish_flag = False
+
+ log_total_loss = 0
+ log_mlm_loss = 0
+ log_rtd_loss = 0
+ log_sample_num = 0
+ train_start_time = time.time()
+
+ # start training
+ train_loop_dataloader = grouper(repeat(data_train), len(ctx_l))
+ while step_num < num_train_steps:
+ tic = time.time()
+ for accum_idx in range(num_accumulated):
+ sample_l = next(train_loop_dataloader)
+ loss_l = []
+ mlm_loss_l = []
+ rtd_loss_l = []
+ for sample, ctx in zip(sample_l, ctx_l):
+ if sample is None:
+ continue
+ # prepare data
+ input_ids, segment_ids, valid_lengths = sample
+ input_ids = input_ids.as_in_ctx(ctx)
+ segment_ids = segment_ids.as_in_ctx(ctx)
+ valid_lengths = valid_lengths.as_in_ctx(ctx)
+ masked_input = data_masker.dynamic_masking(mx.nd, input_ids, valid_lengths)
+ masked_input_ids = masked_input.input_ids
+ length_masks = masked_input.masks
+ unmasked_tokens = masked_input.unmasked_tokens
+ masked_positions = masked_input.masked_positions
+ masked_weights = masked_input.masked_weights
+
+ log_sample_num += len(masked_input_ids)
+
+ with mx.autograd.record():
+ mlm_scores, rtd_scores, corrupted_tokens, labels = model(
+ masked_input_ids, segment_ids, valid_lengths, unmasked_tokens, masked_positions)
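+                    # normalize the losses by the number of contributing tokens and by the
+                    # number of accumulation steps and devices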
+ denominator = (masked_weights.sum() + 1e-6) * num_accumulated * len(ctx_l)
+ mlm_loss = mlm_loss_fn(
+ mx.npx.reshape(mlm_scores, (-5, -1)),
+ unmasked_tokens.reshape((-1,)),
+ masked_weights.reshape((-1, 1))).sum() / denominator
+ denominator = (length_masks.sum() + 1e-6) * num_accumulated * len(ctx_l)
+ rtd_loss = rtd_loss_fn(
+ rtd_scores, labels, length_masks).sum() / denominator
+ output = ElectraOutput(mlm_scores=mlm_scores,
+ rtd_scores=rtd_scores,
+ rtd_labels=labels,
+ corrupted_tokens=corrupted_tokens,
+ )
+ mlm_loss_l.append(mlm_loss)
+ rtd_loss_l.append(rtd_loss)
+ loss = (args.gen_weight * mlm_loss + args.disc_weight * rtd_loss)
+ loss_l.append(loss)
+
+ for loss in loss_l:
+ loss.backward()
+ # All Reduce the Step Loss
+ log_mlm_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in mlm_loss_l]).asnumpy()
+ log_rtd_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in rtd_loss_l]).asnumpy()
+ log_total_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in loss_l]).asnumpy()
+
+ # update
+ trainer.allreduce_grads()
+
+ total_norm, ratio, is_finite = clip_grad_global_norm(
+ params, args.max_grad_norm * num_workers)
+
+ if args.comm_backend == 'horovod':
+            # Note that horovod.trainer._scale defaults to num_workers,
+            # so trainer.update(1) will scale the gradients by 1./num_workers
+ trainer.update(1, ignore_stale_grad=True)
+ else:
+            # gluon.trainer._scale defaults to 1
+ trainer.update(num_workers, ignore_stale_grad=True)
+
+ total_norm = total_norm / num_workers
+ step_num += 1
+ if num_accumulated > 1:
+ # set grad to zero for gradient accumulation
+ model.zero_grad()
+
+ # saving
+ if step_num % save_interval == 0 or step_num >= num_train_steps:
+ if is_master_node:
+ states_option(
+ step_num, trainer, args.output_dir, local_rank, 'Saving')
+ if local_rank == 0:
+ param_path = parameters_option(
+ step_num, model, args.output_dir, 'Saving')
+
+ # logging
+ if step_num % log_interval == 0:
+            # Log the average loss over the last log_interval steps
+ log_mlm_loss /= log_interval
+ log_rtd_loss /= log_interval
+ log_total_loss /= log_interval
+ toc = time.time()
+ logging.info(
+ '[step {}], Loss mlm/rtd/total={:.4f}/{:.4f}/{:.4f},'
+ ' LR={:.6f}, grad_norm={:.4f}. Time cost={:.2f},'
+ ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
+ step_num, log_mlm_loss, log_rtd_loss, log_total_loss,
+ trainer.learning_rate, total_norm, toc - tic, log_sample_num / (toc - tic),
+ (num_train_steps - step_num) / (step_num / (toc - train_start_time)) / 3600))
+ tic = time.time()
+
+ if args.do_eval:
+ evaluation(writer, step_num, masked_input, output)
+ if writer is not None:
+ writer.add_scalars('loss',
+ {'total_loss': log_total_loss,
+ 'mlm_loss': log_mlm_loss,
+ 'rtd_loss': log_rtd_loss},
+ step_num)
+ log_mlm_loss = 0
+ log_rtd_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+
+
+ logging.info('Finish training step: %d', step_num)
+ if is_master_node:
+ state_path = states_option(step_num, trainer, args.output_dir, local_rank, 'Saving')
+ if local_rank == 0:
+ param_path = parameters_option(step_num, model, args.output_dir, 'Saving')
+
+ mx.npx.waitall()
+ train_end_time = time.time()
+ logging.info('Train cost={:.1f}s'.format(train_end_time - train_start_time))
+
+ if writer is not None:
+ writer.close()
+
+ if local_rank == 0:
+ model_name = args.model_name.replace('google', 'gluon')
+ save_dir = os.path.join(args.output_dir, model_name)
+ final_save(model, save_dir, tokenizer)
+
+
+# TODO(zheyuye), Directly implement a metric for weighted accuracy
+def accuracy(labels, predictions, weights=None):
+ if weights is None:
+ weights = mx.np.ones_like(labels)
+ is_correct = mx.np.equal(labels, predictions)
+ acc = (is_correct * weights).sum() / (weights.sum() + 1e-6)
+ return acc.asnumpy().item()
+
+# TODO(zheyuye), Directly implement a metric for weighted AUC
+def auc(labels, probs, weights=None):
+ if isinstance(labels, mx.np.ndarray):
+ labels = labels.asnumpy()
+ if isinstance(probs, mx.np.ndarray):
+ probs = probs.asnumpy()
+ if isinstance(weights, mx.np.ndarray):
+ weights = weights.asnumpy()
+ labels = labels.reshape(-1)
+ probs = probs.reshape(-1)
+ weights = weights.reshape(-1)
+
+ fpr, tpr, thresholds = metrics.roc_curve(labels, probs, sample_weight=weights)
+ return metrics.auc(fpr, tpr)
+
+
+def evaluation(writer, step_num, masked_input, eval_input):
+ length_masks = masked_input.masks
+ unmasked_tokens = masked_input.unmasked_tokens
+ masked_weights = masked_input.masked_weights
+ mlm_scores = eval_input.mlm_scores
+ rtd_scores = eval_input.rtd_scores
+ rtd_labels = eval_input.rtd_labels
+ corrupted_tokens = eval_input.corrupted_tokens
+
+ mlm_log_probs = mx.npx.log_softmax(mlm_scores)
+ mlm_preds = mx.np.argmax(mlm_log_probs, axis=-1).astype(np.int32)
+ rtd_probs = mx.npx.sigmoid(rtd_scores)
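+    # threshold the replaced-token-detection scores at zero, i.e. at probability 0.5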
+ rtd_preds = mx.np.round((mx.np.sign(rtd_scores) + 1) / 2).astype(np.int32)
+
+ mlm_accuracy = accuracy(unmasked_tokens, mlm_preds, masked_weights)
+ corrupted_mlm_accuracy = accuracy(unmasked_tokens, corrupted_tokens, masked_weights)
+ rtd_accuracy = accuracy(rtd_labels, rtd_preds, length_masks)
+ rtd_precision = accuracy(rtd_labels, rtd_preds, length_masks * rtd_preds)
+ rtd_recall = accuracy(rtd_labels, rtd_preds, rtd_labels * rtd_preds)
+ rtd_auc = auc(rtd_labels, rtd_probs, length_masks)
+ logging.info(
+ 'Eval [step {}], mlm_accuracy={:.4f}, corrupted_mlm_accuracy={:.4f},'
+ ' rtd_accuracy={:.4f}, rtd_precision={:.4f}, rtd_recall={:.4f},'
+ ' rtd_auc={:.4f}.'.format(step_num,
+ mlm_accuracy, corrupted_mlm_accuracy,
+ rtd_accuracy, rtd_precision, rtd_recall, rtd_auc))
+ if writer is not None:
+ writer.add_scalars('results',
+ {'mlm_accuracy': mlm_accuracy,
+ 'corrupted_mlm_accuracy': corrupted_mlm_accuracy,
+ 'rtd_accuracy': rtd_accuracy,
+ 'rtd_precision': rtd_precision,
+ 'rtd_recall': rtd_recall,
+ 'rtd_auc': rtd_auc},
+ step_num)
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ if args.do_train:
+ train(args)
diff --git a/scripts/question_answering/README.md b/scripts/question_answering/README.md
new file mode 100644
index 0000000000..c6b8bd790f
--- /dev/null
+++ b/scripts/question_answering/README.md
@@ -0,0 +1,177 @@
+# Question Answering Examples
+
+## SQuAD
+The finetuning scripts for the [Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer/) are available,
+supporting a variety of pretrained models including [BERT](https://github.com/google-research/bert), [ALBERT](https://github.com/google-research/albert),
+and [ELECTRA](https://github.com/google-research/electra). Feel free to choose one of them as `model_name` from the list below.
+
+| BERT | ALBERT | ELECTRA |
+|:--------------------------------:|:------------------------:|:--------------------:|
+| google_en_cased_bert_base | google_albert_base_v2 | google_electra_small |
+| google_en_uncased_bert_base | google_albert_large_v2 | google_electra_base |
+| google_en_cased_bert_large       | google_albert_xlarge_v2  | google_electra_large |
+| google_en_uncased_bert_large | google_albert_xxlarge_v2 | |
+| google_zh_bert_base | | |
+| google_multi_cased_bert_base | | |
+| google_en_cased_bert_wwm_large | | |
+| google_en_uncased_bert_wwm_large | | |
+
+### Data and official evaluation scripts
+
+* [train-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json)
+* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json)
+* [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json)
+* [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json)
+* [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+Download the files and move them under `$DATA_DIR`.
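+
+For example, assuming `DATA_DIR` points to the target directory, the files can be fetched with `wget`:
+
+```bash
+mkdir -p ${DATA_DIR}
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json
+wget -P ${DATA_DIR} https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json
+wget -O ${DATA_DIR}/evaluate-v2.0.py 'https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/'
+```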
+
+### Running Script
+We provide the script to train on the SQuAD dataset.
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_base_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 4 \
+ --num_accumulated 3 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
+```
+or evaluate SQuAD1.1 based on a SQuAD2.0 fine-tuned checkpoint as
+
+```bash
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir ${OUT_DIR} \
+ --param_checkpoint ${CKPT_PATH} \
+ --version 2.0 \
+ --do_eval \
+ --gpus 0,1,2,3 \
+ --eval_batch_size 16 \
+ --overwrite_cache \
+```
+
+We can speed up multi-GPU training via Horovod.
+Compared to KVStore, training the RoBERTa large model on SQuAD 2.0 for 3 epochs saves roughly 1/4 of the training time (8.48 vs. 11.32 hours). Results may vary depending on the training instances.
+
+```bash
+horovodrun -np 4 -H localhost:4 python3 run_squad.py \
+ --comm_backend horovod \
+ ...
+```
+For the ELECTRA models, we fine-tune with layer-wise learning rate decay as follows:
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_small
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+```
+
+For RoBERTa and XLMR, we remove 'segment_ids' and replace `[CLS]` and `[SEP]` with
+`<s>` and `</s>`, which stand for the beginning and end of sentences respectively.
+
+```bash
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=fairseq_roberta_large
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.2 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+```
+
+### Results
+We reproduced the ALBERT models released by Google and fine-tuned them on SQuAD with single models. ALBERT version 2 models are pre-trained without the dropout mechanism but with extra training steps compared to version 1 (see the [original paper](https://arxiv.org/abs/1909.11942) for details).
+
+The listed models are fine-tuned with learning rate 2e-5, 3 epochs, warmup ratio 0.1 and max gradient norm 0.1 (as shown in the commands). Notice that `batch_size` is set per GPU and the global batch size is 48 for all experiments (e.g. `batch_size` 4 x `num_accumulated` 3 x 4 GPUs for ALBERT base); gradient accumulation (`num_accumulated`) can be increased in case of out-of-memory errors.
+
+Performance is shown in the table below, in which SQuAD 1.1 is evaluated with SQuAD 2.0 checkpoints.
+Notice that the standard SQuAD metrics are EM and F1. The former is an exact-match score between predictions and references, while the latter is a token-level F1 score in which common tokens are counted as true positives.
+
+|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | Json | Log | Command |
+|----------------------------------|---------------|---------------|------|-----| --------|
+|ALBERT base | 90.55/83.83 | 82.09/79.40 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) |
+|ALBERT large | 92.66/86.43 | 84.98/82.19 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) |
+|ALBERT xlarge | 93.85/87.71 | 87.92/85.04 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) |
+|ALBERT xxlarge | 95.00/89.01 | 89.91/86.87 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) |
+
+For reference, we've included the results from Google's original experiments.
+
+| Model Name | SQuAD 1.1 dev | SQuAD 2.0 dev|
+|------------|---------------|--------------|
+|ALBERT base (googleresearch/albert) | 90.2/83.2 | 82.1/79.3 |
+|ALBERT large (googleresearch/albert) | 91.8/85.2 | 84.9/81.8 |
+|ALBERT xlarge (googleresearch/albert) | 92.9/86.4 | 87.9/84.1 |
+|ALBERT xxlarge (googleresearch/albert) | 94.6/89.1 | 89.8/86.9 |
+
+For the rest of the pretrained models, the results on SQuAD 1.1 and SQuAD 2.0 are given as follows.
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | Json | Log | Command |
+|--------------------------|---------------|--------------|------|-----|--------|
+|BERT base | 88.40/81.24 | 76.43/73.59 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) |
+|BERT large | 90.45/83.55 | 81.41/78.46 | [json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) |
+|ELECTRA small | 85.42/78.95 | 73.93/71.36 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) |
+|ELECTRA base              | 92.63/87.34   | 86.65/83.95  |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) |
+|ELECTRA large             | 94.95/89.94   | 90.67/88.32  |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) |
+|Mobile BERT | 82.45/88.99 | 79.60/74.11 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) |
+|RoBERTa large | 94.58/88.86 | 89.69/86.80 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) |
+
+For reference, we have also included the results of the original versions from Google and Fairseq.
+
+| Model Name | SQuAD1.1 dev | SQuAD2.0 dev |
+|--------------------------|----------------|---------------|
+|Google BERT base | 88.5/80.8 | - / - |
+|Google BERT large | 90.9/84.1 | - / - |
+|Google ELECTRA small | - /75.8 | - /70.1 |
+|Google ELECTRA base | - /86.8 | - /83.7 |
+|Google ELECTRA large | - /89.7 | - /88.1 |
+|Google Mobile BERT | 81.4/88.6 | 74.4/77.1 |
+|Fairseq RoBERTa large | 94.6/88.9 | 89.4/86.5 |
diff --git a/scripts/question_answering/__init__.py b/scripts/question_answering/__init__.py
deleted file mode 100644
index 4f3fef8cc4..0000000000
--- a/scripts/question_answering/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Question answering example."""
diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh
new file mode 100644
index 0000000000..69bee438f8
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_base_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 4 \
+ --num_accumulated 3 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh
new file mode 100644
index 0000000000..f4c9d069c5
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_large_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 3 \
+ --num_accumulated 4 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh
new file mode 100644
index 0000000000..d14994422d
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_xlarge_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+ --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 1 \
+ --num_accumulated 12 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh
new file mode 100644
index 0000000000..fdb6e89658
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_albert_xxlarge_v2
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 1 \
+ --num_accumulated 12 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 2e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh
new file mode 100644
index 0000000000..a500a3ae50
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_base
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 8 \
+ --num_accumulated 1 \
+ --gpus 0,1,2,3 \
+ --epochs 2 \
+ --lr 1e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh
new file mode 100644
index 0000000000..61872f110b
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 4 \
+ --gpus 0,1,2,3 \
+ --epochs 2 \
+ --lr 5e-5 \
+ --layerwise_decay 0.9 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh
new file mode 100644
index 0000000000..e174258c17
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_electra_small.sh
@@ -0,0 +1,24 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_electra_small
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 32 \
+ --num_accumulated 1 \
+ --gpus 0 \
+ --epochs 2 \
+ --lr 3e-4 \
+ --layerwise_decay 0.8 \
+ --warmup_ratio 0.1 \
+ --wd 0 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh
new file mode 100644
index 0000000000..cfeee56356
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_uncased_mobilebert
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 8 \
+ --num_accumulated 1 \
+ --gpus 0,1,2,3 \
+ --epochs 5 \
+ --lr 4e-5 \
+ --warmup_steps 1400 \
+ --wd 0.0 \
+ --max_seq_length 384 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh
new file mode 100644
index 0000000000..3cdf2cb6ea
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh
@@ -0,0 +1,23 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=fairseq_roberta_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.2 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh
new file mode 100644
index 0000000000..f087860014
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_en_uncased_bert_base
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 6 \
+ --num_accumulated 2 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh
new file mode 100644
index 0000000000..0e80da7688
--- /dev/null
+++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh
@@ -0,0 +1,25 @@
+VERSION=2.0 # Either 2.0 or 1.1
+MODEL_NAME=google_en_uncased_bert_large
+
+# Prepare the Data
+nlp_data prepare_squad --version ${VERSION}
+
+# Run the script
+
+python3 run_squad.py \
+ --model_name ${MODEL_NAME} \
+ --data_dir squad \
+    --output_dir finetune_${MODEL_NAME}_squad_${VERSION} \
+ --version ${VERSION} \
+ --do_eval \
+ --do_train \
+ --batch_size 2 \
+ --num_accumulated 6 \
+ --gpus 0,1,2,3 \
+ --epochs 3 \
+ --lr 3e-5 \
+ --warmup_ratio 0.1 \
+ --wd 0.01 \
+ --max_seq_length 512 \
+ --max_grad_norm 0.1 \
+ --overwrite_cache \
diff --git a/scripts/question_answering/data_pipeline.py b/scripts/question_answering/data_pipeline.py
deleted file mode 100644
index bd42d05c2b..0000000000
--- a/scripts/question_answering/data_pipeline.py
+++ /dev/null
@@ -1,946 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=
-"""SQuAD data data preprocessing pipeline."""
-import collections
-import contextlib
-import itertools
-import json
-import multiprocessing as mp
-import os
-import re
-import time
-
-import nltk
-import numpy as np
-import tqdm
-from mxnet.gluon.data import Dataset
-
-import gluonnlp as nlp
-from gluonnlp import data, Vocab
-from gluonnlp.data import SQuAD
-
-
-class SQuADDataPipeline:
- """Main data processing pipeline class, which encapsulate all preprocessing logic. The class
- process the data in multiprocessing mode using Pool. It can save/load the result of processing,
- but since it happens in a single thread, it is usually faster to just process data from scratch.
- """
-
- def __init__(self, train_para_limit, train_ques_limit, dev_para_limit, dev_ques_limit,
- ans_limit, char_limit, emb_file_name, num_workers=None, save_load_data=False,
- data_root_path='./data'):
- """Method that creates a new instance. If an example is longer that provided limits it will
- be truncated for the dev set and filtered out for the training set.
-
- Parameters
- ----------
- train_para_limit : int
- Maximum characters of a paragraph for training dataset
- train_ques_limit : int
- Maximum characters of a question for training dataset
- dev_para_limit : int
- Maximum characters of a paragraph for dev dataset
- dev_ques_limit
- Maximum characters of a question for dev dataset
- ans_limit : int
- Maximum characters of an answer
- char_limit : int
- Maximum token (word) length of a paragraph, question or answer
- emb_file_name : str
- Glove embedding file name
- num_workers : int, default None
- Number of workers to use for multiprocessing. Default uses all available cores
- data_root_path : str
- Path to store the processed data or load existing processed data, if needed (depends on
- save_load_data flag)
- save_load_data : bool
- Shall save or load data from the ``data_root_path``
- """
- self._train_para_limit = train_para_limit
- self._train_ques_limit = train_ques_limit
- self._dev_para_limit = dev_para_limit
- self._dev_ques_limit = dev_ques_limit
- self._ans_limit = ans_limit
- self._char_limit = char_limit
- self._emb_file_name = emb_file_name
- self._is_cased_embedding = emb_file_name.startswith('glove.840')
- self._num_workers = num_workers
- self._save_load_data = save_load_data
- self._data_root_path = data_root_path
-
- self._processed_train_data_file_name = 'train_processed.json'
- self._processed_dev_data_file_name = 'dev_processed.json'
- self._word_vocab_file_name = 'word_vocab.bin'
- self._char_vocab_file_name = 'char_vocab.bin'
-
- def get_processed_data(self, use_spacy=True, shrink_word_vocab=True, squad_data_root=None):
- """Main method to start data processing
-
- Parameters
- ----------
- use_spacy : bool, default True
- Shall use Spacy as a tokenizer. If not, uses NLTK
- shrink_word_vocab : bool, default True
- When True, only tokens that have embeddings in the embedding file are remained in the
- word_vocab. Otherwise tokens with no embedding also stay
- squad_data_root : str, default None
- Data path to store downloaded original SQuAD data
- Returns
- -------
- train_json_data : dict
- Train JSON data of SQuAD dataset as is to run official evaluation script
- dev_json_data : dict
- Dev JSON data of SQuAD dataset as is to run official evaluation script
- train_examples : SQuADQADataset
- Processed examples to be used for training
- dev_examples : SQuADQADataset
- Processed examples to be used for evaluation
- word_vocab : Vocab
- Word vocabulary
- char_vocab : Vocab
- Char vocabulary
-
- """
- if self._save_load_data and self._has_processed_data():
- return self._load_processed_data()
-
- train_dataset = SQuAD(segment='train', root=squad_data_root) \
- if squad_data_root else SQuAD(segment='train')
- dev_dataset = SQuAD(segment='dev', root=squad_data_root) \
- if squad_data_root else SQuAD(segment='dev')
-
- with contextlib.closing(mp.Pool(processes=self._num_workers)) as pool:
- train_examples, dev_examples = SQuADDataPipeline._tokenize_data(train_dataset,
- dev_dataset,
- use_spacy, pool)
- word_vocab, char_vocab = SQuADDataPipeline._get_vocabs(train_examples, dev_examples,
- self._emb_file_name,
- self._is_cased_embedding,
- shrink_word_vocab,
- pool)
-
- filter_provider = SQuADDataFilter(self._train_para_limit,
- self._train_ques_limit,
- self._ans_limit)
- train_examples = list(filter(filter_provider.filter, train_examples))
-
- train_featurizer = SQuADDataFeaturizer(word_vocab,
- char_vocab,
- self._train_para_limit,
- self._train_ques_limit,
- self._char_limit,
- self._is_cased_embedding)
-
- dev_featuarizer = SQuADDataFeaturizer(word_vocab,
- char_vocab,
- self._dev_para_limit,
- self._dev_ques_limit,
- self._char_limit,
- self._is_cased_embedding)
-
- train_examples, dev_examples = SQuADDataPipeline._featurize_data(train_examples,
- dev_examples,
- train_featurizer,
- dev_featuarizer)
-
- if self._save_load_data:
- self._save_processed_data(train_examples, dev_examples, word_vocab, char_vocab)
-
- return train_dataset._read_data(), dev_dataset._read_data(), \
- SQuADQADataset(train_examples), SQuADQADataset(dev_examples), word_vocab, char_vocab
-
- @staticmethod
- def _tokenize_data(train_dataset, dev_dataset, use_spacy, pool):
- """Tokenize incoming paragpraphs and questions in incoming datsets using provided
- tokenizer withing the processes of the provided multiprocessing pool
-
- Parameters
- ----------
- train_dataset : SQuAD
- training dataset
- dev_dataset : SQuAD
- Dev dataset
- use_spacy : bool
- Use Spacy as a tokenizer. Otherwise uses NLTK
- pool : Pool
- Multiprocessing pool to use for the tokenization
-
- Returns
- -------
- train_examples : List[dict]
- List of tokenized training examples
- dev_examples : List[dict]
- List of tokenized dev examples
- """
- tokenizer = SQuADDataTokenizer(use_spacy)
-
- tic = time.time()
- print('Train examples [{}] transformation started.'.format(len(train_dataset)))
- train_examples = list(tqdm.tqdm(tokenizer.run_async(pool, train_dataset),
- total=len(train_dataset)))
- print('Train examples transformed [{}/{}] in {:.3f} sec'.format(len(train_examples),
- len(train_dataset),
- time.time() - tic))
- tic = time.time()
- print('Dev examples [{}] transformation started.'.format(len(dev_dataset)))
- dev_examples = list(tqdm.tqdm(tokenizer.run_async(pool, dev_dataset),
- total=len(dev_dataset)))
- print('Dev examples transformed [{}/{}] in {:.3f} sec'.format(len(dev_examples),
- len(dev_dataset),
- time.time() - tic))
- return train_examples, dev_examples
-
- @staticmethod
- def _featurize_data(train_examples, dev_examples, train_featurizer, dev_featuarizer):
- """Create features from incoming datasets by replacing tokens with indices.
-
- Parameters
- ----------
- train_examples : List[dict]
- Tokenized train examples
- dev_examples : List[dict]
- Tokenized dev examples
- train_featurizer : SQuADDataFeaturizer
- Parametrized featurizer for training examples
- dev_featuarizer : SQuADDataFeaturizer
- Parametrized featurizer for dev examples
-
- Returns
- -------
- train_ready : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_ready : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
-
- """
- tic = time.time()
- print('Train examples [{}] featurization started.'.format(len(train_examples)))
- train_ready = [train_featurizer.build_features(example)
- for example in tqdm.tqdm(train_examples, total=len(train_examples))]
- print('Train examples featurized [{}] in {:.3f} sec'.format(len(train_examples),
- time.time() - tic))
- tic = time.time()
- print('Dev examples [{}] featurization started.'.format(len(dev_examples)))
- dev_ready = [dev_featuarizer.build_features(example)
- for example in tqdm.tqdm(dev_examples, total=len(dev_examples))]
- print('Dev examples featurized [{}] in {:.3f} sec'.format(len(dev_examples),
- time.time() - tic))
- return train_ready, dev_ready
-
- @staticmethod
- def _get_vocabs(train_examples, dev_examples, emb_file_name, is_cased_embedding,
- shrink_word_vocab, pool):
- """Create both word-level and character-level vocabularies. Vocabularies are built using
- data from both train and dev datasets.
-
- Parameters
- ----------
- train_examples : List[dict]
- Tokenized training examples
- dev_examples : List[dict]
- Tokenized dev examples
- emb_file_name : str
- Glove embedding file name
- is_cased_embedding : bool
- When True, provided embedding file is cased, uncased otherwise
- shrink_word_vocab : bool
- When True, only tokens that have embeddings in the embedding file are remained in the
- word_vocab. Otherwise tokens with no embedding also stay
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- tic = time.time()
- print('Word counters receiving started.')
-
- word_mapper = SQuADAsyncVocabMapper()
- word_reducer = SQuADAsyncVocabReducer()
- word_mapped = list(
- tqdm.tqdm(word_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
- total=len(train_examples) + len(dev_examples)))
- word_partitioned = tqdm.tqdm(SQuADDataPipeline._partition(itertools.chain(*word_mapped)),
- total=len(word_mapped))
- word_counts = list(tqdm.tqdm(word_reducer.run_async(word_partitioned, pool),
- total=len(word_partitioned)))
- print('Word counters received in {:.3f} sec'.format(time.time() - tic))
-
- tic = time.time()
- print('Char counters receiving started.')
- char_mapper = SQuADAsyncVocabMapper(iterate_over_example=True)
- char_reducer = SQuADAsyncVocabReducer()
- char_mapped = list(
- tqdm.tqdm(char_mapper.run_async(itertools.chain(train_examples, dev_examples), pool),
- total=len(train_examples) + len(dev_examples)))
- char_partitioned = SQuADDataPipeline._partition(itertools.chain(*char_mapped))
- char_counts = list(tqdm.tqdm(char_reducer.run_async(char_partitioned, pool),
- total=len(char_partitioned)))
- print('Char counters received in {:.3f} sec'.format(time.time() - tic))
-
- embedding = nlp.embedding.create('glove', source=emb_file_name)
-
- if is_cased_embedding:
- word_counts = itertools.chain(*[[(item[0], item[1]),
- (item[0].lower(), item[1]),
- (item[0].capitalize(), item[1]),
- (item[0].upper(), item[1])] for item in word_counts])
- else:
- word_counts = [(item[0].lower(), item[1]) for item in word_counts]
-
- word_vocab = Vocab({item[0]: item[1] for item in word_counts if
- not shrink_word_vocab or item[0] in embedding.token_to_idx},
- bos_token=None, eos_token=None)
- word_vocab.set_embedding(embedding)
- char_vocab = Vocab({item[0]: item[1] for item in char_counts},
- bos_token=None, eos_token=None)
-
- return word_vocab, char_vocab
-
- def _has_processed_data(self):
- """Check if the data was processed and stored already
-
- Returns
- -------
- ret: Boolean
- Is processed data already exists
- """
- return \
- os.path.exists(
- os.path.join(self._data_root_path, self._processed_train_data_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._processed_dev_data_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._word_vocab_file_name)) and \
- os.path.exists(
- os.path.join(self._data_root_path, self._char_vocab_file_name))
-
- def _load_processed_data(self):
- """ Load processed data from the disk
- Returns
- -------
- train_examples : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_examples : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
- 'r') as f:
- train_examples = json.load(f)
-
- with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'r') as f:
- dev_examples = json.load(f)
-
- with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'r') as f:
- word_vocab = Vocab.from_json(json.load(f))
-
- with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'r') as f:
- char_vocab = Vocab.from_json(json.load(f))
-
- return train_examples, dev_examples, word_vocab, char_vocab
-
- def _save_processed_data(self, train_examples, dev_examples, word_vocab, char_vocab):
- """Save processed data to disk
-
- Parameters
- ----------
- train_examples : List[Tuple]
- Processed train examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- dev_examples : List[Tuple]
- Processed dev examples. Each tuple consists of question_id, record_index,
- context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- """
- with open(os.path.join(self._data_root_path, self._processed_train_data_file_name),
- 'w') as f:
- json.dump(train_examples, f)
-
- with open(os.path.join(self._data_root_path, self._processed_dev_data_file_name), 'w') as f:
- json.dump(dev_examples, f)
-
- with open(os.path.join(self._data_root_path, self._word_vocab_file_name), 'w') as f:
- f.write(word_vocab.to_json())
-
- with open(os.path.join(self._data_root_path, self._char_vocab_file_name), 'w') as f:
- f.write(char_vocab.to_json())
-
- @staticmethod
- def _partition(mapped_values):
- """Groups items with same keys into a single partition
-
- Parameters
- ----------
- mapped_values : List[Tuple]
- List of mapped (key, value) tuples
-
- Returns
- -------
- items: List[Tuple]
- List of partitions, where each partition is (key, List[value])
- """
- partitioned_data = collections.defaultdict(list)
-
- for key, value in mapped_values:
- partitioned_data[key].append(value)
-
- return partitioned_data.items()
-
-
-class SQuADDataTokenizer:
- """SQuAD data tokenizer, that encapsulate the splitting logic of each entry of SQuAD dataset"""
- try:
- _spacy_tokenizer = nlp.data.SpacyTokenizer()
- except (ImportError, AttributeError) as e:
- _spacy_error = e
- def _spacy_tokenizer(*args, **kwargs): # pylint: disable=no-method-argument
- raise SQuADDataTokenizer._spacy_error
-
- def __init__(self, use_spacy=True):
- """Init new SQuADDataTokenizer object
- Parameters
- ----------
- use_spacy : bool, default True
- Use Spacy as base tokenizer. Otherwise uses NLTK with some cleansing
- """
- self._use_spacy = use_spacy
-
- def run_async(self, pool, dataset):
- return pool.imap(self, dataset)
-
- def __call__(self, example):
- return self.tokenize_one_example(example)
-
- def tokenize_one_example(self, example):
- """Tokenize a single example
-
- Parameters
- ----------
- example : Tuple
- A tuple of SQuAD dataset in format (record_index, question_id, question, context,
- answer_list, answer_start)
-
- Returns
- -------
- ret : dict
- Tokenized example with the following keys: context_tokens, context_chars, ques_tokens,
- ques_chars, y1s, y2s, id, context, spans, record_idx
- """
- index, q_id, question, context, answer_list, answer_start = example
-
- context = context.replace('\'\'', '\" ').replace(r'``', '\" ')
- context_tokens = SQuADDataTokenizer._word_tokenize_spacy(context) if self._use_spacy else \
- SQuADDataTokenizer._word_tokenize_nltk(context)
- context_chars = [list(token) for token in context_tokens]
- spans = SQuADDataTokenizer._get_token_spans(context, context_tokens)
-
- ques = question.replace('\'\'', '\" ').replace('``', '\" ')
- ques_tokens = SQuADDataTokenizer._word_tokenize_spacy(ques) if self._use_spacy else \
- SQuADDataTokenizer._word_tokenize_nltk(ques)
- ques_chars = [list(token) for token in ques_tokens]
-
- y1s, y2s = [], []
- answer_texts = []
-
- for answer_text, answer_start in zip(answer_list, answer_start):
- answer_end = answer_start + len(answer_text)
- answer_texts.append(answer_text)
- answer_span = []
- for idx, span in enumerate(spans):
- if not (answer_end <= span[0] or answer_start >= span[1]):
- answer_span.append(idx)
- y1, y2 = answer_span[0], answer_span[-1]
- y1s.append(y1)
- y2s.append(y2)
-
- result = {'context_tokens': context_tokens, 'context_chars': context_chars,
- 'ques_tokens': ques_tokens, 'ques_chars': ques_chars, 'y1s': y1s,
- 'y2s': y2s, 'id': q_id, 'context': context, 'spans': spans, 'record_idx': index}
- return result
-
- @staticmethod
- def _word_tokenize_spacy(sent):
- """Default tokenization method that uses Spacy. Called only if not overridden by providing
- base_tokenizer to SQuADDataTokenizer.__init__
-
- Parameters
- ----------
- sent : str
- A text to tokenize
-
- Returns
- -------
- tokens : List[str]
- List of tokens
- """
- tokens = SQuADDataTokenizer._spacy_tokenizer(sent)
- return tokens
-
- @staticmethod
- def _word_tokenize_nltk(sent):
- """Tokenization method that uses NLTK.
-
- Parameters
- ----------
- sent : str
- A text to tokenize
-
- Returns
- -------
- tokens : List[str]
- List of tokens
- """
- tokens = []
- splitters = ('-', '\u2212', '\u2014', '\u2013', '/', '~', '"', '\'', '\u201C',
- '\u2019', '\u201D', '\u2018', '\u00B0')
-
- sample = sent.replace('\n', ' ').replace(u'\u000A', '').replace(u'\u00A0', '')
- temp_tokens = [token.replace('\'\'', '"').replace('``', '"') for token in
- nltk.word_tokenize(sample)]
-
- for token in temp_tokens:
- tokens.extend(re.split('([{}])'.format(''.join(splitters)), token))
-
- tokens = [token for token in tokens if len(token) > 0]
- return tokens
-
- @staticmethod
- def _get_token_spans(text, tokens):
- """Create a list of tuples that contains tokens character inidices. By using this output
- it is possible to find character-based indices of token start and end
-
- Parameters
- ----------
- text : str
- Original text
- tokens : List[str]
- List of tokens of the original text
-
- Returns
- -------
- ret: List[Tuple]
- List of tuple, where each tuple contains starting character index of the token in the
- text and end character index of the token in the text
- """
- current = 0
- spans = []
- for token in tokens:
- current = text.find(token, current)
- if current < 0:
- print('Token {} cannot be found'.format(token))
- raise Exception()
- spans.append((current, current + len(token)))
- current += len(token)
- return spans
-
-
-class SQuADDataFilter:
- """Filter an example based on the specified conditions"""
-
- def __init__(self, para_limit, ques_limit, ans_limit):
- """Init SQuADDataFilter object
-
- Parameters
- ----------
- para_limit : int
- Maximum allowed length of a paragraph
- ques_limit : int
- Maximum allowed length of a question
- ans_limit : int
- Maximum allowed length of an answer
- """
- self._para_limit = para_limit
- self._ques_limit = ques_limit
- self._ans_limit = ans_limit
-
- def filter(self, example):
- """Returns if the example should be filtered out or not
-
- Parameters
- ----------
- example : dict
- A dataset examples with context_tokens, ques_tokens, y1s and y2s keys
-
- Returns
- -------
- ret : Boolean
- True if an example should remain in the dataset, and False if it should be excluded from
- the dataset
- """
- return len(example['context_tokens']) <= self._para_limit and \
- len(example['ques_tokens']) <= self._ques_limit and \
- (example['y2s'][0] - example['y1s'][0]) <= self._ans_limit
-
-
-class SQuADAsyncVocabMapper:
- """A multiprocessing implementation of a Mapper for tokens counting"""
-
- def __init__(self, iterate_over_example=False):
- """Init MapReduce object
-
- Parameters
- ----------
- iterate_over_example : bool, default False
- Should use examples as is, or iterate over its content
- """
- self._iterate_over_example = iterate_over_example
-
- def run_async(self, examples, pool):
- """Run async processing over examples
-
- Parameters
- ----------
- examples : List[dict]
- List of dictionaries with context_tokens and ques_tokens keys
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- ret : List[Tuple]
- List of tuples of tokens and counts: (str, int)
- """
- return pool.imap(self, examples)
-
- def __call__(self, example):
- """Maps examples into distinct tokens
-
- Parameters
- ----------
- example : dict
- Example to process with context_tokens and ques_tokens keys
-
- Returns
- -------
- mapped_values : List[Tuple]
- Result of mapping process. Each tuple of (token, count) format
- """
- para_counter = data.count_tokens(example['context_tokens'] if not self._iterate_over_example
- else [c for tkn in example['context_tokens'] for c in tkn])
- ques_counter = data.count_tokens(example['ques_tokens'] if not self._iterate_over_example
- else [c for tkn in example['ques_tokens'] for c in tkn])
- counter = para_counter + ques_counter
- return list(counter.items())
-
-
-class SQuADAsyncVocabReducer:
- """A multiprocessing implementation of a Reducing for tokens counting"""
-
- def run_async(self, items, pool):
- """Run async processing over examples
-
- Parameters
- ----------
- items : List[Tuple]
- List of tuples of (token, count) structure
- pool : Pool
- Multiprocessing pool to use
-
- Returns
- -------
- ret : List[Tuple]
- List of tuples of tokens and counts: (str, int)
- """
- return pool.imap(self, items)
-
- def __call__(self, item):
- """Sums up number of times a token was used
-
- Parameters
- ----------
- item : Tuple
- A tuple of (token, counts) format
-
- Returns
- -------
- ret : Tuple
- A tuple of (token, sum_of_counts)
-
- """
- token, counts = item
- return token, sum(counts)
-
-
-class SQuADDataFeaturizer:
- """Class that converts tokenized examples into featurized"""
-
- def __init__(self, word_vocab, char_vocab, para_limit, ques_limit, char_limit,
- is_cased_embedding):
- """Init SQuADDataFeaturizer object
-
- Parameters
- ----------
- word_vocab : Vocab
- Word-level vocabulary
- char_vocab : Vocab
- Char-level vocabulary
- para_limit : int
- Maximum characters in a paragraph
- ques_limit : int
- Maximum characters in a question
- char_limit : int
- Maximum characters in a token
- is_cased_embedding: bool
- Is underlying embedding is cased or uncased
- """
- self._para_limit = para_limit
- self._ques_limit = ques_limit
- self._char_limit = char_limit
-
- self._word_vocab = word_vocab
- self._char_vocab = char_vocab
-
- self._is_cased_embedding = is_cased_embedding
-
- def _get_words_emb(self, words):
- """Get embedding for the words
-
- Parameters
- ----------
- words : list[str]
- Words to embed
-
- Returns
- -------
- ret : np.array
- Array of embeddings for words
- """
-
- if not self._is_cased_embedding:
- return self._word_vocab[[word.lower() for word in words]]
-
- result = np.full([len(words)], fill_value=0, dtype=np.float32)
- word_emb_matrix = np.full([len(words), 4], fill_value=0, dtype=np.float32)
-
- for i, w in enumerate(words):
- word_emb_matrix[i, :] = self._word_vocab[[w, w.lower(), w.capitalize(), w.upper()]]
-
- mask = word_emb_matrix != 0
- first_non_zero_embeddings_indices = np.where(mask.any(axis=1), mask.argmax(axis=1), -1)
-
- for i, index in enumerate(first_non_zero_embeddings_indices):
- result[i] = word_emb_matrix[i, index]
-
- return result
-
- def build_features(self, example):
- """Generate features for a given example
-
- Parameters
- ----------
- example : dict
- A tokenized example of a dataset
-
- Returns
- -------
- ret : Tuple
- An example with tokens replaced with indices of the following format: question_id,
- record_index, context_tokens_indices, question_tokens_indices, context_chars_indices,
- question_char_indices, start_token_index_of_the_answer, end_token_index_of_the_answer,
- context, context_tokens_spans
- """
- context_idxs = np.full([self._para_limit],
- fill_value=self._word_vocab[self._word_vocab.padding_token],
- dtype=np.float32)
-
- ctx_chars_idxs = np.full([self._para_limit, self._char_limit],
- fill_value=self._char_vocab[self._char_vocab.padding_token],
- dtype=np.float32)
-
- ques_idxs = np.full([self._ques_limit],
- fill_value=self._word_vocab[self._word_vocab.padding_token],
- dtype=np.float32)
-
- ques_char_idxs = np.full([self._ques_limit, self._char_limit],
- fill_value=self._char_vocab[self._char_vocab.padding_token],
- dtype=np.float32)
-
- context_len = min(len(example['context_tokens']), self._para_limit)
- context_idxs[:context_len] = self._get_words_emb(example['context_tokens'][:context_len])
-
- ques_len = min(len(example['ques_tokens']), self._ques_limit)
- ques_idxs[:ques_len] = self._get_words_emb(example['ques_tokens'][:ques_len])
-
- for i in range(0, context_len):
- char_len = min(len(example['context_chars'][i]), self._char_limit)
- ctx_chars_idxs[i, :char_len] = self._char_vocab[example['context_chars'][i][:char_len]]
-
- for i in range(0, ques_len):
- char_len = min(len(example['ques_chars'][i]), self._char_limit)
- ques_char_idxs[i, :char_len] = self._char_vocab[example['ques_tokens'][i][:char_len]]
-
- start, end = example['y1s'][-1], example['y2s'][-1]
-
- record = (example['id'],
- example['record_idx'],
- context_idxs,
- ques_idxs,
- ctx_chars_idxs,
- ques_char_idxs,
- start,
- end,
- example['context'],
- example['spans'])
-
- return record
-
-
-class SQuADQADataset(Dataset):
- """Dataset that wraps the featurized examples with standard Gluon API Dataset format. It allows
- to fetch a record by question id for the evaluation"""
-
- def __init__(self, records):
- super(SQuADQADataset, self).__init__()
- self._data = records
- self._record_idx_to_record = {}
-
- for record in records:
- self._record_idx_to_record[record[1]] = {'q_id': record[0], 'rec': record}
-
- def __getitem__(self, idx):
- """Get example by index in the original list
-
- Parameters
- ----------
- idx : int
-
- Returns
- -------
- ret : Tuple of question_id, record_index, context_tokens_indices, question_tokens_indices,
- context_chars_indices, question_char_indices, start_token_index_of_the_answer,
- end_token_index_of_the_answer, context, context_tokens_spans
- """
- return self._data[idx]
-
- def __len__(self):
- """Get the number of the examples in the dataset
-
- Returns
- -------
- ret : int
- Number of examples of the dataset
- """
- return len(self._data)
-
- def get_q_id_by_rec_idx(self, rec_idx):
- """Returns a question id associated with provided record index from original SQuAD dataset
-
- Parameters
- ----------
- rec_idx : int
- Record index in SQuAD dataset
-
- Returns
- -------
- question_id : str
- """
- return self._record_idx_to_record[rec_idx]['q_id']
-
- def get_record_by_idx(self, rec_idx):
- """Returns a record associated with provided record index from original SQuAD dataset
-
- Parameters
- ----------
- rec_idx : int
-
- Returns
- -------
- ret : Tuple of question_id, record_index, context_tokens_indices, question_tokens_indices,
- context_chars_indices, question_char_indices, start_token_index_of_the_answer,
- end_token_index_of_the_answer, context, context_tokens_spans
- """
- return self._record_idx_to_record[rec_idx]['rec']
-
-
-class SQuADDataLoaderTransformer:
- """Thin wrapper on SQuADQADataset that removed non-numeric values from the record. The output of
- that transformer can be provided to a DataLoader"""
-
- def __call__(self, q_id, record_idx, ctx_idxs, ques_idxs, ctx_chars_idxs, ques_char_idxs,
- start, end, context, spans):
- """Return the same record with non-numeric values removed from the output
-
- Parameters
- ----------
- q_id : str
- Question Id
- record_idx : int
- Record index
- ctx_idxs : NDArray
- Indices of context tokens
- ques_idxs : NDArray
- Indices of question tokens
- ctx_chars_idxs : NDArray
- Indices of context characters
- ques_char_idxs : NDArray
- Indices of question characters
- start : int
- Start of the answer
- end : int
- End of the answer
- context : str
- Original context string
- spans : List[Tuple]
- List of character indices of each token of the context.
-
- Returns
- -------
- record_idx : int
- Record index
- ctx_idxs : NDArray
- Indices of context tokens
- ques_idxs : NDArray
- Indices of question tokens
- ctx_chars_idxs : NDArray
- Indices of context characters
- ques_char_idxs : NDArray
- Indices of question characters
- start : int
- Start of the answer
- end : int
- End of the answer
- """
- return record_idx, ctx_idxs, ques_idxs, ctx_chars_idxs, ques_char_idxs, start, end
diff --git a/scripts/question_answering/eval_utils.py b/scripts/question_answering/eval_utils.py
new file mode 100644
index 0000000000..e28aecb7af
--- /dev/null
+++ b/scripts/question_answering/eval_utils.py
@@ -0,0 +1,267 @@
+"""Modification version of official evaluation script for SQuAD version 2.0.
+(https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/)
+
+"""
+import collections
+import json
+import copy
+import re
+import string
+
+
+def make_qid_to_has_ans(dataset):
+ qid_to_has_ans = {}
+ for article in dataset:
+ for p in article['paragraphs']:
+ for qa in p['qas']:
+ qid_to_has_ans[qa['id']] = bool(qa['answers'])
+ return qid_to_has_ans
+
+
+def normalize_answer(s):
+ """Lower text and remove punctuation, articles and extra whitespace."""
+
+ def remove_articles(text):
+ regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+ return re.sub(regex, ' ', text)
+
+ def white_space_fix(text):
+ return ' '.join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return ''.join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_tokens(s):
+ if not s:
+ return []
+ return normalize_answer(s).split()
+
+
+def compute_exact(a_gold, a_pred):
+ return int(normalize_answer(a_gold) == normalize_answer(a_pred))
+
+
+def compute_f1(a_gold, a_pred):
+ """
+    Compute the token-level F1 score, in which the common tokens are treated
+    as true positives. Precision and recall are the fractions of common tokens
+    relative to the prediction and the ground truth, respectively.
+ """
+ gold_toks = get_tokens(a_gold)
+ pred_toks = get_tokens(a_pred)
+ common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+ num_same = sum(common.values())
+ if len(gold_toks) == 0 or len(pred_toks) == 0:
+ # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+ return int(gold_toks == pred_toks)
+ if num_same == 0:
+ return 0
+ precision = 1.0 * num_same / len(pred_toks)
+ recall = 1.0 * num_same / len(gold_toks)
+ f1 = (2 * precision * recall) / (precision + recall)
+ return f1
+
+
+def get_raw_scores(dataset, preds):
+ exact_scores = {}
+ f1_scores = {}
+ for article in dataset:
+ for p in article['paragraphs']:
+ for qa in p['qas']:
+ qid = qa['id']
+ gold_answers = [a['text'] for a in qa['answers']
+ if normalize_answer(a['text'])]
+ if not gold_answers:
+                    # For unanswerable questions, the only correct answer is the empty string
+ gold_answers = ['']
+ if qid not in preds:
+ print('Missing prediction for %s' % qid)
+ continue
+ a_pred = preds[qid]
+ # Take max over all gold answers
+ exact_scores[qid] = max(compute_exact(a, a_pred)
+ for a in gold_answers)
+ f1_scores[qid] = max(compute_f1(a, a_pred)
+ for a in gold_answers)
+ return exact_scores, f1_scores
+
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
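+    # Illustrative behaviour: with na_prob_thresh=0.5, a question whose na_prob is 0.8 is
+    # treated as predicted-unanswerable, so its score becomes 1.0 if the question really
+    # has no answer (qid_to_has_ans[qid] is False) and 0.0 otherwise.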
+ new_scores = {}
+ for qid, s in scores.items():
+        # Treat questions whose no-answer probability exceeds the threshold as unanswerable
+ pred_na = na_probs[qid] > na_prob_thresh
+ if pred_na:
+            # The raw score is replaced by 1 if the question is indeed
+            # unanswerable, else by 0
+ new_scores[qid] = float(not qid_to_has_ans[qid])
+ else:
+ new_scores[qid] = s
+ return new_scores
+
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+ if not qid_list:
+ total = len(exact_scores)
+ return collections.OrderedDict([
+ ('exact', 100.0 * sum(exact_scores.values()) / total),
+ ('f1', 100.0 * sum(f1_scores.values()) / total),
+ ('total', total),
+ ])
+ else:
+ total = len(qid_list)
+ return collections.OrderedDict([
+ ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
+ ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
+ ('total', total),
+ ])
+
+
+def merge_eval(main_eval, new_eval, prefix):
+ for k in new_eval:
+ main_eval['%s_%s' % (prefix, k)] = new_eval[k]
+
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+ """
+ Find the best threshold of the raw scores.
+
+    The initial score is set to the number of unanswerable questions,
+    assuming that each unanswerable question is correctly predicted.
+    The questions are then traversed in ascending order of their no-answer
+    probability; the running score is updated by the difference ('diff') from
+    that assumption, and the threshold with the best running score is kept.
+ """
+ num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+ cur_score = num_no_ans
+ best_score = cur_score
+ best_thresh = 0.0
+    # Sort question ids by ascending na_prob, so that the questions most likely
+    # to be answerable are read first.
+ qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+ for i, qid in enumerate(qid_list):
+ if qid not in scores:
+ continue
+ if qid_to_has_ans[qid]:
+ # For the answerable question
+ diff = scores[qid]
+ else:
+ # For the unanswerable question
+ if preds[qid]:
+ # Falsely predict the answerability
+ diff = -1
+ else:
+                # Correctly predict the answerability. This is only true if the
+                # prediction is blank, which is not the case before revision
+ diff = 0
+ cur_score += diff
+ if cur_score > best_score:
+ # adjust the best thresh over current thresh (na_probs[qid])
+ best_score = cur_score
+ best_thresh = na_probs[qid]
+ return 100.0 * best_score / len(scores), best_thresh
+
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+ best_exact, exact_thresh = find_best_thresh(
+ preds, exact_raw, na_probs, qid_to_has_ans)
+ best_f1, f1_thresh = find_best_thresh(
+ preds, f1_raw, na_probs, qid_to_has_ans)
+ main_eval['best_exact'] = best_exact
+ main_eval['best_exact_thresh'] = exact_thresh
+ main_eval['best_f1'] = best_f1
+ main_eval['best_f1_thresh'] = f1_thresh
+
+
+def revise_unanswerable(preds, na_probs, na_prob_thresh):
+ """
+    Revise the prediction results, returning a null string for every unanswerable
+    question whose unanswerable probability is above the threshold.
+
+ Parameters
+ ----------
+ preds: dict
+ A dictionary of full prediction of spans
+ na_probs: dict
+ A dictionary of unanswerable probabilities
+ na_prob_thresh: float
+ threshold of the unanswerable probability
+
+ Returns
+ -------
+ revised: dict
+ A dictionary of revised prediction
+ """
+ revised = copy.deepcopy(preds)
+ for q_id in na_probs.keys():
+ if na_probs[q_id] > na_prob_thresh:
+ revised[q_id] = ""
+ return revised
+
+
+def squad_eval(data_file, preds, na_probs, na_prob_thresh=0.0, revise=False):
+ """
+
+ Parameters
+ ----------
+ data_file
+ dataset(list) or data_file(str)
+ preds
+ predictions dictionary
+ na_probs
+ probabilities dictionary of unanswerable
+ na_prob_thresh
+ threshold of unanswerable
+ revise
+ Wether to get the final predictions with impossible answers replaced
+ with null string ''
+ Returns
+ -------
+ out_eval
+ A dictionary of output results
+ (preds_out)
+ A dictionary of final predictions
+ """
+ if isinstance(data_file, str):
+ with open(data_file) as f:
+ dataset_json = json.load(f)
+ dataset = dataset_json['data']
+ elif isinstance(data_file, list):
+ dataset = data_file
+ if na_probs is None:
+ na_probs = {k: 0.0 for k in preds}
+ # not necessary to revise results of SQuAD 1.1
+ revise = False
+ qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
+ has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+ no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+ exact_raw, f1_raw = get_raw_scores(dataset, preds)
+ exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
+ na_prob_thresh)
+ f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
+ na_prob_thresh)
+ out_eval = make_eval_dict(exact_thresh, f1_thresh)
+ if has_ans_qids:
+ has_ans_eval = make_eval_dict(
+ exact_thresh, f1_thresh, qid_list=has_ans_qids)
+ merge_eval(out_eval, has_ans_eval, 'HasAns')
+ if no_ans_qids:
+ no_ans_eval = make_eval_dict(
+ exact_thresh, f1_thresh, qid_list=no_ans_qids)
+ merge_eval(out_eval, no_ans_eval, 'NoAns')
+ find_all_best_thresh(out_eval, preds, exact_raw,
+ f1_raw, na_probs, qid_to_has_ans)
+
+ if revise:
+ thresh = (out_eval['best_exact_thresh'] +
+ out_eval['best_f1_thresh']) * 0.5
+ preds_out = revise_unanswerable(preds, na_probs, thresh)
+ return out_eval, preds_out
+ else:
+ return out_eval, preds
diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py
new file mode 100644
index 0000000000..58b156cbf3
--- /dev/null
+++ b/scripts/question_answering/models.py
@@ -0,0 +1,360 @@
+import mxnet as mx
+from mxnet.gluon import nn, HybridBlock
+from mxnet.util import use_np
+from gluonnlp.layers import get_activation
+from gluonnlp.op import select_vectors_by_position
+from gluonnlp.attention_cell import masked_logsoftmax, masked_softmax
+
+
+@use_np
+class ModelForQABasic(HybridBlock):
+ """The basic pretrained model for QA. It is used in the original BERT paper for SQuAD 1.1.
+
+ Here, we directly use the backbone network to extract the contextual embeddings and use
+ another dense layer to map the contextual embeddings to the start scores and end scores.
+
+    use_segmentation is used to mark whether we segment the input sentence. In RoBERTa and XLMR,
+    this flag is set to False, and the QA model then no longer accepts `token_types` as valid input.
+
+ - use_segmentation=True:
+ tokens : Question Context
+        token_types: 0 0 0 1 1
+
+ - use_segmentation=False:
+ tokens : Question Context
+        token_types: None
+ """
+ def __init__(self, backbone, weight_initializer=None, bias_initializer=None,
+ use_segmentation=True):
+ super().__init__()
+
+ self.backbone = backbone
+ self.use_segmentation = use_segmentation
+ self.qa_outputs = nn.Dense(units=2, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer)
+
+ def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ tokens
+ Shape (batch_size, seq_length)
+ The merged input tokens
+ token_types
+ Shape (batch_size, seq_length)
+ Token types for the sequences, used to indicate whether the word belongs to the
+ first sentence or the second one.
+ valid_length
+ Shape (batch_size,)
+ Valid length of the sequence. This is used to mask the padded tokens.
+ p_mask
+ The mask that is associated with the tokens.
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ The log-softmax scores that the position is the start position.
+ end_logits
+ Shape (batch_size, sequence_length)
+ The log-softmax scores that the position is the end position.
+ """
+ # Get contextual embedding with the shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ scores = self.qa_outputs(contextual_embeddings)
+ start_scores = scores[:, :, 0]
+ end_scores = scores[:, :, 1]
+ start_logits = masked_logsoftmax(F, start_scores, mask=p_mask, axis=-1)
+ end_logits = masked_logsoftmax(F, end_scores, mask=p_mask, axis=-1)
+ return start_logits, end_logits
+
+ def inference(self, tokens, token_types, valid_length, p_mask,
+ start_top_n: int = 5, end_top_n: int = 5):
+ """Get the inference result with beam search
+
+ Parameters
+ ----------
+ tokens
+ The input tokens. Shape (batch_size, sequence_length)
+ token_types
+ The input token types. Shape (batch_size, sequence_length)
+ valid_length
+ The valid length of the tokens. Shape (batch_size,)
+ p_mask
+ The mask which indicates that some tokens won't be used in the calculation.
+ Shape (batch_size, sequence_length)
+ start_top_n
+ The number of candidates to select for the start position.
+ end_top_n
+ The number of candidates to select for the end position.
+
+ Returns
+ -------
+ start_top_logits
+ The top start logits
+ Shape (batch_size, start_top_n)
+ start_top_index
+ Index of the top start logits
+ Shape (batch_size, start_top_n)
+ end_top_logits
+ The top end logits.
+ Shape (batch_size, end_top_n)
+ end_top_index
+ Index of the top end logits
+ Shape (batch_size, end_top_n)
+ """
+ # Shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ scores = self.qa_outputs(contextual_embeddings)
+ start_scores = scores[:, :, 0]
+ end_scores = scores[:, :, 1]
+ start_logits = masked_logsoftmax(mx.nd, start_scores, mask=p_mask, axis=-1)
+ end_logits = masked_logsoftmax(mx.nd, end_scores, mask=p_mask, axis=-1)
+ # The shape of start_top_index will be (..., start_top_n)
+ start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
+ ret_typ='both')
+        # end_top_logits and end_top_index similarly have shape (batch_size, end_top_n);
+        # in this basic model the end positions are predicted independently of the start positions.
+ end_top_logits, end_top_index = mx.npx.topk(end_logits, k=end_top_n, axis=-1,
+ ret_typ='both')
+ return start_top_logits, start_top_index, end_top_logits, end_top_index
+
+
+@use_np
+class ModelForQAConditionalV1(HybridBlock):
+ """Here, we use three networks to predict the start scores, end scores and answerable scores.
+
+ We formulate p(start, end, answerable | contextual_embedding) as the product of the
+ following three terms:
+
+ - p(start | contextual_embedding)
+ - p(end | start, contextual_embedding)
+ - p(answerable | contextual_embedding)
+
+    In the inference phase, we can use beam search to decode the most likely answer span.
+
+    use_segmentation is used to mark whether we segment the input sentence. In RoBERTa and XLMR,
+    this flag is set to False, and the QA model then no longer accepts `token_types` as valid input.
+
+ - use_segmentation=True:
+ tokens : Question Context
+        token_types: 0 0 0 1 1
+
+ - use_segmentation=False:
+ tokens : Question Context
+        token_types: None
+ """
+ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
+ activation='tanh', weight_initializer=None, bias_initializer=None,
+ use_segmentation=True):
+ super().__init__()
+ self.backbone = backbone
+ self.use_segmentation = use_segmentation
+ self.start_scores = nn.Dense(1, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer)
+ self.end_scores = nn.HybridSequential()
+ self.end_scores.add(nn.Dense(units, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.end_scores.add(get_activation(activation))
+ self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
+ self.end_scores.add(nn.Dense(1, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.answerable_scores = nn.HybridSequential()
+ self.answerable_scores.add(nn.Dense(units, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+ self.answerable_scores.add(get_activation(activation))
+ self.answerable_scores.add(nn.Dropout(dropout_prob))
+ self.answerable_scores.add(nn.Dense(2, flatten=False,
+ weight_initializer=weight_initializer,
+ bias_initializer=bias_initializer))
+
+ def get_start_logits(self, F, contextual_embedding, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ """
+ start_scores = F.np.squeeze(self.start_scores(contextual_embedding), -1)
+ start_logits = masked_logsoftmax(F, start_scores, mask=p_mask, axis=-1)
+ return start_logits
+
+ def get_end_logits(self, F, contextual_embedding, start_positions, p_mask):
+ """
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+ start_positions
+ Shape (batch_size, N)
+ We process multiple candidates simultaneously
+ p_mask
+ Shape (batch_size, sequence_length)
+
+ Returns
+ -------
+ end_logits
+ Shape (batch_size, N, sequence_length)
+ """
+ # Select the features at the start_positions
+ # start_feature will have shape (batch_size, N, C)
+ start_features = select_vectors_by_position(F, contextual_embedding, start_positions)
+ # Concatenate the start_feature and the contextual_embedding
+ contextual_embedding = F.np.expand_dims(contextual_embedding, axis=1) # (B, 1, T, C)
+ start_features = F.np.expand_dims(start_features, axis=2) # (B, N, 1, C)
+ concat_features = F.np.concatenate([F.npx.broadcast_like(start_features,
+ contextual_embedding, 2, 2),
+ F.npx.broadcast_like(contextual_embedding,
+ start_features, 1, 1)],
+ axis=-1) # (B, N, T, 2C)
+ end_scores = self.end_scores(concat_features)
+ end_scores = F.np.squeeze(end_scores, -1)
+ end_logits = masked_logsoftmax(F, end_scores, mask=F.np.expand_dims(p_mask, axis=1),
+ axis=-1)
+ return end_logits
+
+ def get_answerable_logits(self, F, contextual_embedding, p_mask):
+ """Get the answerable logits.
+
+ Parameters
+ ----------
+ F
+ contextual_embedding
+ Shape (batch_size, sequence_length, C)
+ p_mask
+ Shape (batch_size, sequence_length)
+ Mask the sequence.
+ 0 --> Denote that the element is masked,
+ 1 --> Denote that the element is not masked
+
+ Returns
+ -------
+ answerable_logits
+ Shape (batch_size, 2)
+ """
+ # Shape (batch_size, sequence_length)
+ start_scores = F.np.squeeze(self.start_scores(contextual_embedding), -1)
+ start_score_weights = masked_softmax(F, start_scores, p_mask, axis=-1)
+ start_agg_feature = F.npx.batch_dot(F.np.expand_dims(start_score_weights, axis=1),
+ contextual_embedding)
+ start_agg_feature = F.np.squeeze(start_agg_feature, 1)
+ cls_feature = contextual_embedding[:, 0, :]
+ answerable_scores = self.answerable_scores(F.np.concatenate([start_agg_feature,
+ cls_feature], axis=-1))
+ answerable_logits = F.npx.log_softmax(answerable_scores, axis=-1)
+ return answerable_logits
+
+ def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask, start_position):
+ """
+
+ Parameters
+ ----------
+ F
+ tokens
+ Shape (batch_size, sequence_length)
+ token_types
+ Shape (batch_size, sequence_length)
+ valid_length
+ Shape (batch_size,)
+ p_mask
+ Shape (batch_size, sequence_length)
+ start_position
+ Shape (batch_size,)
+
+ Returns
+ -------
+ start_logits
+ Shape (batch_size, sequence_length)
+ end_logits
+ Shape (batch_size, sequence_length)
+ answerable_logits
+ """
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ start_logits = self.get_start_logits(F, contextual_embeddings, p_mask)
+ end_logits = self.get_end_logits(F, contextual_embeddings,
+ F.np.expand_dims(start_position, axis=1),
+ p_mask)
+ end_logits = F.np.squeeze(end_logits, axis=1)
+ answerable_logits = self.get_answerable_logits(F, contextual_embeddings, p_mask)
+ return start_logits, end_logits, answerable_logits
+
+ def inference(self, tokens, token_types, valid_length, p_mask,
+ start_top_n: int = 5, end_top_n: int = 5):
+ """Get the inference result with beam search
+
+ Parameters
+ ----------
+ tokens
+ The input tokens. Shape (batch_size, sequence_length)
+ token_types
+ The input token types. Shape (batch_size, sequence_length)
+ valid_length
+ The valid length of the tokens. Shape (batch_size,)
+ p_mask
+ The mask which indicates that some tokens won't be used in the calculation.
+ Shape (batch_size, sequence_length)
+ start_top_n
+ The number of candidates to select for the start position.
+ end_top_n
+ The number of candidates to select for the end position.
+
+ Returns
+ -------
+ start_top_logits
+ The top start logits
+ Shape (batch_size, start_top_n)
+ start_top_index
+ Index of the top start logits
+ Shape (batch_size, start_top_n)
+ end_top_logits
+ The top end logits.
+ Shape (batch_size, start_top_n, end_top_n)
+ end_top_index
+ Index of the top end logits
+ Shape (batch_size, start_top_n, end_top_n)
+ answerable_logits
+ The answerable logits. Here 0 --> answerable and 1 --> not answerable.
+            Shape (batch_size, 2)
+ """
+ # Shape (batch_size, sequence_length, C)
+ if self.use_segmentation:
+ contextual_embeddings = self.backbone(tokens, token_types, valid_length)
+ else:
+ contextual_embeddings = self.backbone(tokens, valid_length)
+ start_logits = self.get_start_logits(mx.nd, contextual_embeddings, p_mask)
+ # The shape of start_top_index will be (..., start_top_n)
+ start_top_logits, start_top_index = mx.npx.topk(start_logits, k=start_top_n, axis=-1,
+ ret_typ='both')
+ end_logits = self.get_end_logits(mx.nd, contextual_embeddings, start_top_index, p_mask)
+ # Note that end_top_index and end_top_log_probs have shape (bsz, start_n_top, end_n_top)
+ # So that for each start position, there are end_n_top end positions on the third dim.
+ end_top_logits, end_top_index = mx.npx.topk(end_logits, k=end_top_n, axis=-1,
+ ret_typ='both')
+ answerable_logits = self.get_answerable_logits(mx.nd, contextual_embeddings, p_mask)
+ return start_top_logits, start_top_index, end_top_logits, end_top_index, \
+ answerable_logits
diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
new file mode 100644
index 0000000000..e4eaf3f629
--- /dev/null
+++ b/scripts/question_answering/run_squad.py
@@ -0,0 +1,963 @@
+"""
+Question Answering with Pretrained Language Model
+"""
+# pylint:disable=redefined-outer-name,logging-format-interpolation
+
+import os
+import json
+import time
+import logging
+import argparse
+import functools
+import collections
+from multiprocessing import Pool, cpu_count
+
+import mxnet as mx
+import numpy as np
+from mxnet.lr_scheduler import PolyScheduler
+
+import gluonnlp.data.batchify as bf
+from models import ModelForQABasic, ModelForQAConditionalV1
+from eval_utils import squad_eval
+from squad_utils import SquadFeature, get_squad_examples, convert_squad_example_to_feature
+from gluonnlp.models import get_backbone
+from gluonnlp.utils.misc import repeat, grouper, set_seed, init_comm, \
+ logging_config, count_parameters, parse_ctx
+from gluonnlp.initializer import TruncNorm
+from gluonnlp.data.sampler import SplitSampler
+from gluonnlp.utils.parameter import grad_global_norm, clip_grad_global_norm
+
+try:
+ import horovod.mxnet as hvd
+except ImportError:
+ pass
+
+mx.npx.set_np()
+
+CACHE_PATH = os.path.realpath(os.path.join(os.path.realpath(__file__), '..', 'cached'))
+if not os.path.exists(CACHE_PATH):
+ os.makedirs(CACHE_PATH, exist_ok=True)
+
+
+def parse_args():
+ parser = argparse.ArgumentParser(
+ description='Question Answering example. '
+ 'We fine-tune the pretrained model on SQuAD dataset.')
+ parser.add_argument('--model_name', type=str, default='google_albert_base_v2',
+ help='Name of the pretrained model.')
+ parser.add_argument('--do_train', action='store_true',
+ help='Whether to train the model')
+ parser.add_argument('--do_eval', action='store_true',
+ help='Whether to evaluate the model')
+ parser.add_argument('--data_dir', type=str, default='squad')
+ parser.add_argument('--version', default='2.0', choices=['1.1', '2.0'],
+ help='Version of the SQuAD dataset.')
+ parser.add_argument('--output_dir', type=str, default='squad_out',
+ help='The output directory where the model params will be written.'
+ ' default is squad_out')
+ # Communication
+ parser.add_argument('--comm_backend', type=str, default='device',
+ choices=['horovod', 'dist_sync_device', 'device'],
+ help='Communication backend.')
+ parser.add_argument('--gpus', type=str, default='0',
+ help='list of gpus to run, e.g. 0 or 0,2,5. -1 means using cpu.')
+ # Training hyperparameters
+ parser.add_argument('--seed', type=int, default=100, help='Random seed')
+ parser.add_argument('--log_interval', type=int, default=50,
+ help='The logging interval for training')
+ parser.add_argument('--eval_log_interval', type=int, default=10,
+ help='The logging interval for evaluation')
+ parser.add_argument('--save_interval', type=int, default=None,
+                        help='The number of steps between saving model parameters. '
+                        'Default is every epoch.')
+ parser.add_argument('--epochs', type=float, default=3.0,
+ help='Number of epochs, default is 3')
+ parser.add_argument('--num_train_steps', type=int, default=None,
+ help='The number of training steps. Note that epochs will be ignored '
+ 'if training steps are set')
+ parser.add_argument('--batch_size', type=int, default=8,
+                        help='Batch size. Number of examples per gpu in a minibatch. default is 8')
+ parser.add_argument('--eval_batch_size', type=int, default=16,
+ help='Evaluate batch size. Number of examples per gpu in a minibatch for '
+ 'evaluation.')
+ parser.add_argument('--max_grad_norm', type=float, default=1.0,
+ help='Max gradient norm.')
+ parser.add_argument('--optimizer', type=str, default='adamw',
+ help='optimization algorithm. default is adamw')
+ parser.add_argument('--adam_epsilon', type=float, default=1e-6,
+ help='epsilon of AdamW optimizer')
+ parser.add_argument('--adam_betas', default='(0.9, 0.999)', metavar='B',
+ help='betas for Adam optimizer')
+ parser.add_argument('--num_accumulated', type=int, default=1,
+ help='The number of batches for gradients accumulation to '
+ 'simulate large batch size.')
+ parser.add_argument('--lr', type=float, default=2e-5,
+ help='Initial learning rate. default is 2e-5')
+ parser.add_argument('--warmup_ratio', type=float, default=0.1,
+ help='Ratio of warmup steps in the learning rate scheduler.')
+ parser.add_argument('--warmup_steps', type=int, default=None,
+ help='warmup steps. Note that either warmup_steps or warmup_ratio is set.')
+ parser.add_argument('--wd', type=float, default=0.01, help='weight decay')
+ parser.add_argument('--layerwise_decay', type=float, default=-1, help='Layer-wise lr decay')
+ parser.add_argument('--untunable_depth', type=float, default=-1,
+ help='Depth of untunable parameters')
+ parser.add_argument('--classifier_dropout', type=float, default=0.1,
+ help='dropout of classifier')
+ # Data pre/post processing
+ parser.add_argument('--max_seq_length', type=int, default=512,
+                        help='The maximum total input sequence length after tokenization. '
+ 'Sequences longer than this will be truncated, and sequences shorter '
+ 'than this will be padded. default is 512')
+ parser.add_argument('--doc_stride', type=int, default=128,
+ help='When splitting up a long document into chunks, how much stride to '
+ 'take between chunks. default is 128')
+ parser.add_argument('--max_query_length', type=int, default=64,
+ help='The maximum number of tokens for the query. Questions longer than '
+ 'this will be truncated to this length. default is 64')
+ parser.add_argument('--round_to', type=int, default=None,
+                        help='The length of padded sequences will be rounded up to be a multiple'
+                        ' of this argument. When round_to is set to 8, training throughput '
+ 'may increase for mixed precision training on GPUs with TensorCores.')
+ parser.add_argument('--overwrite_cache', action='store_true',
+ help='Whether to overwrite the feature cache.')
+ # Evaluation hyperparameters
+ parser.add_argument('--start_top_n', type=int, default=5,
+ help='Number of start-position candidates')
+ parser.add_argument('--end_top_n', type=int, default=5,
+ help='Number of end-position candidates corresponding '
+ 'to a start position')
+ parser.add_argument('--n_best_size', type=int, default=20, help='Top N results written to file')
+ parser.add_argument('--max_answer_length', type=int, default=30,
+ help='The maximum length of an answer that can be generated. This is '
+ 'needed because the start and end predictions are not conditioned '
+ 'on one another. default is 30')
+ parser.add_argument('--param_checkpoint', type=str, default=None,
+ help='The parameter checkpoint for evaluating the model')
+ parser.add_argument('--backbone_path', type=str, default=None,
+ help='The parameter checkpoint of backbone model')
+ parser.add_argument('--all_evaluate', action='store_true',
+ help='Whether to evaluate all intermediate checkpoints '
+ 'instead of only last one')
+ parser.add_argument('--max_saved_ckpt', type=int, default=5,
+ help='The maximum number of saved checkpoints')
+ parser.add_argument('--eval_dtype', type=str, default='float32',
+ help='Data type used for evaluation. Either float32 or float16')
+ args = parser.parse_args()
+ return args
+
+
+class SquadDatasetProcessor:
+
+ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
+ """
+
+ Parameters
+ ----------
+ tokenizer
+ The tokenizer
+ doc_stride
+ The stride to chunk the document
+ max_seq_length
+ Maximum length of the merged data
+ max_query_length
+ Maximum query length
+ """
+ self._tokenizer = tokenizer
+ self._doc_stride = doc_stride
+ self._max_seq_length = max_seq_length
+ self._max_query_length = max_query_length
+
+ vocab = tokenizer.vocab
+ self.pad_id = vocab.pad_id
+        # For RoBERTa-style models, use the BOS token as [CLS] and the EOS token as [SEP]
+ self.cls_id = vocab.bos_id if 'cls_token' not in vocab.special_token_keys else vocab.cls_id
+ self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
+
+ # TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
+ self.ChunkFeature = collections.namedtuple('ChunkFeature',
+ ['qas_id',
+ 'data',
+ 'valid_length',
+ 'segment_ids',
+ 'masks',
+ 'is_impossible',
+ 'gt_start',
+ 'gt_end',
+ 'context_offset',
+ 'chunk_start',
+ 'chunk_length'])
+ self.BatchifyFunction = bf.NamedTuple(self.ChunkFeature,
+ {'qas_id': bf.List(),
+ 'data': bf.Pad(val=self.pad_id),
+ 'valid_length': bf.Stack(),
+ 'segment_ids': bf.Pad(),
+ 'masks': bf.Pad(val=1),
+ 'is_impossible': bf.Stack(),
+ 'gt_start': bf.Stack(),
+ 'gt_end': bf.Stack(),
+ 'context_offset': bf.Stack(),
+ 'chunk_start': bf.Stack(),
+ 'chunk_length': bf.Stack()})
+
+ def process_sample(self, feature: SquadFeature):
+ """Process the data to the following format.
+
+ Note that we mask all the special tokens except the CLS token. The reason for not masking
+ the CLS token is that if the question is not answerable, we will set the start and end to
+ be 0.
+
+
+ Merged: Question Context
+ Segment IDs: 0 0 0 1 1
+ Mask: 0 1 1 0 1
+
+        In the masks produced here, 1 means that the position is masked, i.e., it can never hold
+        the answer. The training/evaluation loops flip this via ``1 - masks`` because the
+        network itself uses 1 --> not masked, 0 --> masked.
+
+ Parameters
+ ----------
+ feature
+ Tokenized SQuAD feature
+
+ Returns
+ -------
+ ret
+ Divide the feature into multiple chunks and extract the feature which contains
+ the following:
+ - data
+ The data that concatenates the query and the context + special tokens
+ - valid_length
+ The valid_length of the data
+ - segment_ids
+ We assign the query part as segment 0 and the context part as segment 1.
+            - masks
+                We mask the query tokens and all the special tokens except [CLS].
+                1 --> masked, 0 --> not masked.
+ - is_impossible
+ Whether the provided context is impossible to answer or not.
+ - gt_start
+ The ground-truth start location of the span
+ - gt_end
+ The ground-truth end location of the span
+ - chunk_start
+ The start of the chunk
+ - chunk_length
+ The length of the chunk
+ """
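+        # Each chunk is laid out as [CLS] + truncated query + [SEP] + context chunk + [SEP],
+        # so the context part starts at offset len(truncated_query_ids) + 2.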
+ ret = []
+ truncated_query_ids = feature.query_token_ids[:self._max_query_length]
+ chunks = feature.get_chunks(
+ doc_stride=self._doc_stride,
+ max_chunk_length=self._max_seq_length - len(truncated_query_ids) - 3)
+ for chunk in chunks:
+ data = np.array([self.cls_id] + truncated_query_ids + [self.sep_id] +
+ feature.context_token_ids[chunk.start:(chunk.start + chunk.length)] +
+ [self.sep_id], dtype=np.int32)
+ valid_length = len(data)
+ segment_ids = np.array([0] + [0] * len(truncated_query_ids) +
+ [0] + [1] * chunk.length + [1], dtype=np.int32)
+ masks = np.array([0] + [1] * len(truncated_query_ids) + [1] + [0] * chunk.length + [1],
+ dtype=np.int32)
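+            # masks == 1 marks positions that can never be the answer (the query tokens and
+            # both [SEP] tokens); [CLS] and the context tokens are left as 0.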
+ context_offset = len(truncated_query_ids) + 2
+ if chunk.gt_start_pos is None and chunk.gt_end_pos is None:
+ start_pos = 0
+ end_pos = 0
+ else:
+ # Here, we increase the start and end because we put query before context
+ start_pos = chunk.gt_start_pos + context_offset
+ end_pos = chunk.gt_end_pos + context_offset
+ chunk_feature = self.ChunkFeature(qas_id=feature.qas_id,
+ data=data,
+ valid_length=valid_length,
+ segment_ids=segment_ids,
+ masks=masks,
+ is_impossible=chunk.is_impossible,
+ gt_start=start_pos,
+ gt_end=end_pos,
+ context_offset=context_offset,
+ chunk_start=chunk.start,
+ chunk_length=chunk.length)
+ ret.append(chunk_feature)
+ return ret
+
+ def get_train(self, features, skip_unreliable=True):
+ """Get the training dataset
+
+ Parameters
+ ----------
+ features
+ skip_unreliable
+ Whether to skip the unreliable spans in the training set
+
+ Returns
+ -------
+ train_dataset
+ num_token_answer_mismatch
+ num_unreliable
+ """
+ train_dataset = []
+ num_token_answer_mismatch = 0
+ num_unreliable = 0
+ for feature in features:
+ if feature.token_answer_mismatch:
+ num_token_answer_mismatch += 1
+ if feature.unreliable_span:
+ num_unreliable += 1
+ if skip_unreliable and feature.unreliable_span:
+ # Skip when not reliable
+ continue
+ # Process the feature
+ chunk_features = self.process_sample(feature)
+ train_dataset.extend(chunk_features)
+ return train_dataset, num_token_answer_mismatch, num_unreliable
+
+
+def get_squad_features(args, tokenizer, segment):
+ """
+    Get the processed data features of SquadExample objects
+
+ Parameters
+ ----------
+ args : argparse.Namespace
+ tokenizer:
+ Tokenizer instance
+ segment: str
+ train or dev
+
+ Returns
+ -------
+ data_features
+ The list of processed data features
+ """
+ data_cache_path = os.path.join(CACHE_PATH,
+ '{}_{}_squad_{}.ndjson'.format(
+ segment, args.model_name, args.version))
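+    # Tokenized features are cached as newline-delimited JSON so that later runs with the same
+    # model and SQuAD version can skip the multiprocess tokenization, unless --overwrite_cache is set.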
+ is_training = (segment == 'train')
+ if os.path.exists(data_cache_path) and not args.overwrite_cache:
+ data_features = []
+ with open(data_cache_path, 'r') as f:
+ for line in f:
+ data_features.append(SquadFeature.from_json(line))
+        logging.info('Found cached data features, loaded from {}'.format(data_cache_path))
+ else:
+ data_examples = get_squad_examples(args.data_dir, segment=segment, version=args.version)
+ start = time.time()
+ num_process = min(cpu_count(), 8)
+ logging.info('Tokenize Data:')
+ with Pool(num_process) as pool:
+ data_features = pool.map(functools.partial(convert_squad_example_to_feature,
+ tokenizer=tokenizer,
+ is_training=is_training), data_examples)
+ logging.info('Done! Time spent:{:.2f} seconds'.format(time.time() - start))
+ with open(data_cache_path, 'w') as f:
+ for feature in data_features:
+ f.write(feature.to_json() + '\n')
+
+ return data_features
+
+
+def get_network(model_name,
+ ctx_l,
+ dropout=0.1,
+ checkpoint_path=None,
+ backbone_path=None,
+ dtype='float32'):
+ """
+ Get the network that fine-tune the Question Answering Task
+
+ Parameters
+ ----------
+ model_name : str
+ The model name of the backbone model
+ ctx_l :
+ Context list of training device like [mx.gpu(0), mx.gpu(1)]
+ dropout : float
+ Dropout probability of the task specified layer
+    checkpoint_path: str
+        Path to a fine-tuned checkpoint
+    backbone_path: str
+        Path to the backbone model to be loaded in qa_net
+    dtype: str
+        Data type used to load the backbone parameters
+
+ Returns
+ -------
+ cfg
+ tokenizer
+ qa_net
+ use_segmentation
+ """
+ # Create the network
+ use_segmentation = 'roberta' not in model_name and 'xlmr' not in model_name
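+    # RoBERTa/XLM-R style models do not use token-type (segment) embeddings,
+    # so segment ids are skipped for them.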
+ Model, cfg, tokenizer, download_params_path, _ = \
+ get_backbone(model_name, load_backbone=not backbone_path)
+ backbone = Model.from_cfg(cfg, use_pooler=False, dtype=dtype)
+ # Load local backbone parameters if backbone_path provided.
+ # Otherwise, download backbone parameters from gluon zoo.
+
+ backbone_params_path = backbone_path if backbone_path else download_params_path
+ if checkpoint_path is None:
+ backbone.load_parameters(backbone_params_path, ignore_extra=True,
+ ctx=ctx_l, cast_dtype=True)
+ num_params, num_fixed_params = count_parameters(backbone.collect_params())
+ logging.info(
+            'Loading Backbone Model from {}, with total/fixed parameters={}/{}'.format(
+ backbone_params_path, num_params, num_fixed_params))
+ qa_net = ModelForQAConditionalV1(backbone=backbone,
+ dropout_prob=dropout,
+ use_segmentation=use_segmentation,
+ weight_initializer=TruncNorm(stdev=0.02))
+ if checkpoint_path is None:
+        # Ignore the UserWarning raised during initialization;
+        # there is no need to re-initialize the parameters of the backbone.
+ qa_net.initialize(ctx=ctx_l)
+ else:
+ qa_net.load_parameters(checkpoint_path, ctx=ctx_l, cast_dtype=True)
+ qa_net.hybridize()
+
+ return cfg, tokenizer, qa_net, use_segmentation
+
+
+def train(args):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ cfg, tokenizer, qa_net, use_segmentation = \
+ get_network(args.model_name, ctx_l,
+ args.classifier_dropout,
+ args.param_checkpoint,
+ args.backbone_path)
+
+ logging.info('Prepare training data')
+ train_features = get_squad_features(args, tokenizer, segment='train')
+ dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer,
+ doc_stride=args.doc_stride,
+ max_seq_length=args.max_seq_length,
+ max_query_length=args.max_query_length)
+ logging.info('Processing the Training data:')
+ train_dataset, num_answer_mismatch, num_unreliable \
+ = dataset_processor.get_train(train_features, skip_unreliable=True)
+ logging.info('Done! #Unreliable Span={} / #Mismatched Answer={} / #Total={}'
+ .format(num_unreliable, num_answer_mismatch, len(train_features)))
+
+ # Get dataset statistics
+ num_impossible = 0
+ for sample in train_dataset:
+ num_impossible += sample.is_impossible
+ logging.info('Before Chunking, #Train/Is Impossible = {}/{}'
+ .format(len(train_features),
+ sum([ele.is_impossible for ele in train_features])))
+ logging.info('After Chunking, #Train Sample/Is Impossible = {}/{}'
+ .format(len(train_dataset), num_impossible))
+ sampler = SplitSampler(len(train_dataset), num_parts=num_workers,
+ part_index=rank, even_size=True)
+ train_dataloader = mx.gluon.data.DataLoader(
+ train_dataset,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.batch_size,
+ num_workers=0,
+ sampler=sampler)
+ if 'electra' in args.model_name:
+        # Freeze parameters. Note this does not work for the ALBERT model,
+        # since parameters in all layers are shared.
+ if args.untunable_depth > 0:
+ qa_net.backbone.frozen_params(args.untunable_depth)
+ if args.layerwise_decay > 0:
+ qa_net.backbone.apply_layerwise_decay(args.layerwise_decay)
+
+ logging.info('Creating distributed trainer...')
+ # Collect differentiable parameters
+ param_dict = qa_net.collect_params()
+ # Do not apply weight decay to all the LayerNorm and bias
+ for _, v in qa_net.collect_params('.*beta|.*gamma|.*bias').items():
+ v.wd_mult = 0.0
+ params = [p for p in param_dict.values() if p.grad_req != 'null']
+ # Set grad_req if gradient accumulation is required
+ if args.num_accumulated > 1:
+ logging.info('Using gradient accumulation. Effective global batch size = {}'
+ .format(args.num_accumulated * args.batch_size * len(ctx_l) * num_workers))
+ for p in params:
+ p.grad_req = 'add'
+ # backend specific implementation
+ if args.comm_backend == 'horovod':
+ # Horovod: fetch and broadcast parameters
+ hvd.broadcast_parameters(param_dict, root_rank=0)
+
+ epoch_size = (len(train_dataloader) + len(ctx_l) - 1) // len(ctx_l)
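+    # The dataloader yields per-device batches, so dividing by the number of devices
+    # (rounding up) gives the number of batch groups per epoch.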
+ if args.num_train_steps is not None:
+ num_train_steps = args.num_train_steps
+ else:
+ num_train_steps = int(args.epochs * epoch_size / args.num_accumulated)
+ if args.warmup_steps is not None:
+ warmup_steps = args.warmup_steps
+ else:
+ warmup_steps = int(num_train_steps * args.warmup_ratio)
+ assert warmup_steps is not None, 'Must specify either warmup_steps or warmup_ratio'
+ log_interval = args.log_interval
+ save_interval = args.save_interval if args.save_interval is not None\
+ else epoch_size // args.num_accumulated
+ logging.info('#Total Training Steps={}, Warmup={}, Save Interval={}'
+ .format(num_train_steps, warmup_steps, save_interval))
+
+ # set up optimization
+ lr_scheduler = PolyScheduler(max_update=num_train_steps,
+ base_lr=args.lr,
+ warmup_begin_lr=0,
+ pwr=1,
+ final_lr=0,
+ warmup_steps=warmup_steps,
+ warmup_mode='linear')
+ optimizer_params = {'learning_rate': args.lr,
+ 'wd': args.wd,
+ 'lr_scheduler': lr_scheduler,
+ }
+ adam_betas = eval(args.adam_betas)
+ if args.optimizer == 'adamw':
+ optimizer_params.update({'beta1': adam_betas[0],
+ 'beta2': adam_betas[1],
+ 'epsilon': args.adam_epsilon,
+ 'correct_bias': False,
+ })
+ elif args.optimizer == 'adam':
+ optimizer_params.update({'beta1': adam_betas[0],
+ 'beta2': adam_betas[1],
+ 'epsilon': args.adam_epsilon,
+ })
+ if args.comm_backend == 'horovod':
+ trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params)
+ else:
+ trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params,
+ update_on_kvstore=False)
+
+ num_samples_per_update = 0
+ loss_denom = float(len(ctx_l) * args.num_accumulated)
+
+ log_span_loss = 0
+ log_answerable_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+ if args.num_accumulated != 1:
+ # set grad to zero for gradient accumulation
+ qa_net.zero_grad()
+ global_tic = time.time()
+ tic = time.time()
+ for step_num, batch_data in enumerate(
+ grouper(repeat(train_dataloader), len(ctx_l) * args.num_accumulated)):
+ for sample_l in grouper(batch_data, len(ctx_l)):
+ loss_l = []
+ span_loss_l = []
+ answerable_loss_l = []
+ for sample, ctx in zip(sample_l, ctx_l):
+ if sample is None:
+ continue
+ # Copy the data to device
+ tokens = sample.data.as_in_ctx(ctx)
+ log_sample_num += len(tokens)
+ num_samples_per_update += len(tokens)
+ segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
+ valid_length = sample.valid_length.as_in_ctx(ctx)
+ p_mask = sample.masks.as_in_ctx(ctx)
+ gt_start = sample.gt_start.as_in_ctx(ctx).astype(np.int32)
+ gt_end = sample.gt_end.as_in_ctx(ctx).astype(np.int32)
+ is_impossible = sample.is_impossible.as_in_ctx(ctx).astype(np.int32)
+ batch_idx = mx.np.arange(tokens.shape[0], dtype=np.int32, ctx=ctx)
+ p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
+ with mx.autograd.record():
+ start_logits, end_logits, answerable_logits \
+ = qa_net(tokens, segment_ids, valid_length, p_mask, gt_start)
+ sel_start_logits = start_logits[batch_idx, gt_start]
+ sel_end_logits = end_logits[batch_idx, gt_end]
+ sel_answerable_logits = answerable_logits[batch_idx, is_impossible]
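+                    # Negative log-likelihood of the gold start/end positions and of the
+                    # answerability label; the start/end terms are averaged (0.5 each) and the
+                    # answerability term is down-weighted by the same 0.5 factor.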
+ span_loss = - 0.5 * (sel_start_logits + sel_end_logits).sum()
+ answerable_loss = -0.5 * sel_answerable_logits.sum()
+ loss = (span_loss + answerable_loss) / loss_denom
+ loss_l.append(loss)
+ span_loss_l.append(span_loss)
+ answerable_loss_l.append(answerable_loss)
+
+ for loss in loss_l:
+ loss.backward()
+ # All Reduce the Step Loss
+ log_span_loss += sum([ele.as_in_ctx(ctx_l[0]) for ele in span_loss_l]).asnumpy()
+ log_total_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in loss_l]).asnumpy() * loss_denom
+ log_answerable_loss += sum([ele.as_in_ctx(ctx_l[0])
+ for ele in answerable_loss_l]).asnumpy()
+ # update
+ trainer.allreduce_grads()
+
+ if args.max_grad_norm > 0:
+                # Here, the accumulated gradients are
+                #   \sum_{n=1}^N g_n / loss_denom
+                # Thus, in order to clip the average gradient
+                #   \frac{1}{N} \sum_{n=1}^N g_n --> clip to args.max_grad_norm
+                # we need to change the threshold to
+                #   \sum_{n=1}^N g_n / loss_denom --> clip to args.max_grad_norm * N / loss_denom
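+                # For example, with N = num_samples_per_update = 32 and loss_denom = 8,
+                # the effective clipping threshold becomes args.max_grad_norm * 32 / 8.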
+ total_norm, ratio, is_finite = clip_grad_global_norm(
+ params, args.max_grad_norm * num_samples_per_update / loss_denom)
+ else:
+ total_norm = grad_global_norm(params)
+
+ total_norm = total_norm / (num_samples_per_update / loss_denom)
+ trainer.update(num_samples_per_update / loss_denom)
+ if args.num_accumulated != 1:
+ # set grad to zero for gradient accumulation
+ qa_net.zero_grad()
+
+ # saving
+            if ((local_rank == 0 and (step_num + 1) % save_interval == 0)
+                    or (step_num + 1) >= num_train_steps):
+ version_prefix = 'squad' + args.version
+ ckpt_name = '{}_{}_{}.params'.format(args.model_name,
+ version_prefix,
+ (step_num + 1))
+ params_saved = os.path.join(args.output_dir, ckpt_name)
+ qa_net.save_parameters(params_saved)
+ ckpt_candidates = [
+ f for f in os.listdir(
+ args.output_dir) if f.endswith('.params')]
+ # keep last `max_saved_ckpt` checkpoints
+ if len(ckpt_candidates) > args.max_saved_ckpt:
+ ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
+ os.remove(os.path.join(args.output_dir, ckpt_candidates[0]))
+ logging.info('Params saved in: {}'.format(params_saved))
+
+ # logging
+ if local_rank == 0 and (step_num + 1) % log_interval == 0:
+ log_span_loss /= log_sample_num
+ log_answerable_loss /= log_sample_num
+ log_total_loss /= log_sample_num
+ toc = time.time()
+ logging.info(
+ 'Step: {}/{}, Loss span/answer/total={:.4f}/{:.4f}/{:.4f},'
+ ' LR={:.8f}, grad_norm={:.4f}. Time cost={:.2f}, Throughput={:.2f} samples/s'
+ ' ETA={:.2f}h'.format((step_num + 1), num_train_steps, log_span_loss,
+ log_answerable_loss, log_total_loss, trainer.learning_rate,
+ total_norm, toc - tic, log_sample_num / (toc - tic),
+ (num_train_steps - (step_num + 1)) / ((step_num + 1) / (toc - global_tic)) / 3600))
+ tic = time.time()
+ log_span_loss = 0
+ log_answerable_loss = 0
+ log_total_loss = 0
+ log_sample_num = 0
+ num_samples_per_update = 0
+
+ if (step_num + 1) >= num_train_steps:
+ toc = time.time()
+ logging.info(
+ 'Finish training step: {} within {} hours'.format(
+ step_num + 1, (toc - global_tic) / 3600))
+ break
+
+ return params_saved
+
+
+RawResultExtended = collections.namedtuple(
+ 'RawResultExtended',
+ ['qas_id',
+ 'start_top_logits',
+ 'start_top_index',
+ 'end_top_logits',
+ 'end_top_index',
+ 'answerable_logits'])
+
+
+def predict_extended(original_feature,
+ chunked_features,
+ results,
+ n_best_size,
+ max_answer_length=64,
+ start_top_n=5,
+ end_top_n=5):
+ """Get prediction results for SQuAD.
+
+ Start Logits: (B, N_start)
+ End Logits: (B, N_start, N_end)
+
+ Parameters
+ ----------
+    original_feature
+        The original SquadFeature before chunking
+ chunked_features
+ List of ChunkFeatures
+ results
+ List of model predictions for span start and span end.
+ n_best_size
+ Best N results written to file
+ max_answer_length
+ Maximum length of the answer tokens.
+ start_top_n
+ Number of start-position candidates
+ end_top_n
+        Number of end-position candidates
+
+    Returns
+ -------
+ not_answerable_score
+ Model's estimate that the question is not answerable.
+ prediction
+ The final prediction.
+ nbest_json
+ n-best predictions with their probabilities.
+ """
+    not_answerable_score = 1000000  # Initialize the not-answerable score with a large positive value
+ # If one chunk votes for answerable, we will treat the context as answerable,
+ # Thus, the overall not_answerable_score = min(chunk_not_answerable_score)
+ all_start_idx = []
+ all_end_idx = []
+ all_pred_score = []
+ context_length = len(original_feature.context_token_ids)
+ token_max_context_score = np.full((len(chunked_features), context_length),
+ -np.inf,
+ dtype=np.float32)
+ for i, chunked_feature in enumerate(chunked_features):
+ chunk_start = chunked_feature.chunk_start
+ chunk_length = chunked_feature.chunk_length
+ for j in range(chunk_start, chunk_start + chunk_length):
+ # This is a heuristic score
+ # TODO investigate the impact
+ token_max_context_score[i, j] = min(j - chunk_start,
+ chunk_start + chunk_length - 1 - j) \
+ + 0.01 * chunk_length
+ token_max_chunk_id = token_max_context_score.argmax(axis=0)
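+    # For each context token, token_max_chunk_id stores the chunk in which that token is most
+    # central (with a small bonus for longer chunks); a predicted start position is only kept
+    # below if it comes from that token's max-context chunk.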
+
+ for chunk_id, (result, chunk_feature) in enumerate(zip(results, chunked_features)):
+ # We use the log-likelihood as the not answerable score.
+ # Thus, a high score indicates that the answer is not answerable
+ cur_not_answerable_score = float(result.answerable_logits[1])
+ not_answerable_score = min(not_answerable_score, cur_not_answerable_score)
+ # Calculate the start_logits + end_logits as the overall score
+ context_offset = chunk_feature.context_offset
+ chunk_start = chunk_feature.chunk_start
+ chunk_length = chunk_feature.chunk_length
+ for i in range(start_top_n):
+ for j in range(end_top_n):
+ pred_score = result.start_top_logits[i] + result.end_top_logits[i, j]
+ start_index = result.start_top_index[i]
+ end_index = result.end_top_index[i, j]
+ # We could hypothetically create invalid predictions, e.g., predict
+ # that the start of the answer span is in the query tokens or out of
+ # the chunk. We throw out all invalid predictions.
+ if not (context_offset <= start_index < context_offset + chunk_length) or \
+ not (context_offset <= end_index < context_offset + chunk_length) or \
+ end_index < start_index:
+ continue
+ pred_answer_length = end_index - start_index + 1
+ if pred_answer_length > max_answer_length:
+ continue
+ start_idx = int(start_index - context_offset + chunk_start)
+ end_idx = int(end_index - context_offset + chunk_start)
+ if token_max_chunk_id[start_idx] != chunk_id:
+ continue
+ all_start_idx.append(start_idx)
+ all_end_idx.append(end_idx)
+ all_pred_score.append(pred_score)
+ sorted_start_end_score = sorted(zip(all_start_idx, all_end_idx, all_pred_score),
+ key=lambda args: args[-1], reverse=True)
+ nbest = []
+ context_text = original_feature.context_text
+ context_token_offsets = original_feature.context_token_offsets
+ seen_predictions = set()
+ for start_idx, end_idx, pred_score in sorted_start_end_score:
+ if len(seen_predictions) >= n_best_size:
+ break
+ pred_answer = context_text[context_token_offsets[start_idx][0]:
+ context_token_offsets[end_idx][1]]
+ seen_predictions.add(pred_answer)
+ nbest.append((pred_answer, pred_score))
+
+ # In very rare edge cases we could have no valid predictions. So we
+ # just create a nonce prediction in this case to avoid failure.
+ if len(nbest) == 0:
+ nbest.append(('', float('-inf')))
+ all_scores = np.array([ele[1] for ele in nbest], dtype=np.float32)
+ probs = np.exp(all_scores) / np.sum(np.exp(all_scores))
+ nbest_json = []
+ for i, (entry, prob) in enumerate(zip(nbest, probs)):
+ output = collections.OrderedDict()
+ output['text'] = entry[0]
+ output['probability'] = float(prob)
+ nbest_json.append(output)
+
+ assert len(nbest_json) >= 1
+ return not_answerable_score, nbest[0][0], nbest_json
+
+
+def evaluate(args, last=True):
+ store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm(
+ args.comm_backend, args.gpus)
+ # only evaluate once
+ if rank != 0:
+ logging.info('Skipping node {}'.format(rank))
+ return
+ ctx_l = parse_ctx(args.gpus)
+ logging.info(
+        'Starting inference without horovod on the first node on device {}'.format(
+ str(ctx_l)))
+
+ cfg, tokenizer, qa_net, use_segmentation = get_network(
+ args.model_name, ctx_l, args.classifier_dropout)
+
+ logging.info('Prepare dev data')
+ dev_features = get_squad_features(args, tokenizer, segment='dev')
+ dev_data_path = os.path.join(args.data_dir, 'dev-v{}.json'.format(args.version))
+ dataset_processor = SquadDatasetProcessor(tokenizer=tokenizer,
+ doc_stride=args.doc_stride,
+ max_seq_length=args.max_seq_length,
+ max_query_length=args.max_query_length)
+ dev_all_chunk_features = []
+ dev_chunk_feature_ptr = [0]
+ for feature in dev_features:
+ chunk_features = dataset_processor.process_sample(feature)
+ dev_all_chunk_features.extend(chunk_features)
+ dev_chunk_feature_ptr.append(dev_chunk_feature_ptr[-1] + len(chunk_features))
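+    # dev_chunk_feature_ptr[i]:dev_chunk_feature_ptr[i + 1] indexes the chunks that belong to
+    # the i-th dev feature, which lets us regroup per-chunk predictions per original example.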
+
+ def eval_validation(ckpt_name, best_eval):
+ """
+ Model inference during validation or final evaluation.
+ """
+ dev_dataloader = mx.gluon.data.DataLoader(
+ dev_all_chunk_features,
+ batchify_fn=dataset_processor.BatchifyFunction,
+ batch_size=args.eval_batch_size,
+ num_workers=0,
+ shuffle=False)
+
+ log_interval = args.eval_log_interval
+ all_results = []
+ epoch_tic = time.time()
+ tic = time.time()
+ epoch_size = len(dev_features)
+ total_num = 0
+ log_num = 0
+ for batch_idx, dev_batch in enumerate(grouper(dev_dataloader, len(ctx_l))):
+ # Predict for each chunk
+ for sample, ctx in zip(dev_batch, ctx_l):
+ if sample is None:
+ continue
+ # Copy the data to device
+ tokens = sample.data.as_in_ctx(ctx)
+ total_num += len(tokens)
+ log_num += len(tokens)
+ segment_ids = sample.segment_ids.as_in_ctx(ctx) if use_segmentation else None
+ valid_length = sample.valid_length.as_in_ctx(ctx)
+ p_mask = sample.masks.as_in_ctx(ctx)
+ p_mask = 1 - p_mask # In the network, we use 1 --> no_mask, 0 --> mask
+ start_top_logits, start_top_index, end_top_logits, end_top_index, answerable_logits \
+ = qa_net.inference(tokens, segment_ids, valid_length, p_mask,
+ args.start_top_n, args.end_top_n)
+ for i, qas_id in enumerate(sample.qas_id):
+ result = RawResultExtended(qas_id=qas_id,
+ start_top_logits=start_top_logits[i].asnumpy(),
+ start_top_index=start_top_index[i].asnumpy(),
+ end_top_logits=end_top_logits[i].asnumpy(),
+ end_top_index=end_top_index[i].asnumpy(),
+ answerable_logits=answerable_logits[i].asnumpy())
+
+ all_results.append(result)
+
+ # logging
+ if (batch_idx + 1) % log_interval == 0:
+ # Output the loss of per step
+ toc = time.time()
+ logging.info(
+ '[batch {}], Time cost={:.2f},'
+ ' Throughput={:.2f} samples/s, ETA={:.2f}h'.format(
+ batch_idx + 1, toc - tic, log_num / (toc - tic),
+ (epoch_size - total_num) / (total_num / (toc - epoch_tic)) / 3600))
+ tic = time.time()
+ log_num = 0
+
+ epoch_toc = time.time()
+        logging.info('Time cost=%.2f s, Throughput=%.2f samples/s', epoch_toc - epoch_tic,
+ total_num / (epoch_toc - epoch_tic))
+
+ all_predictions = collections.OrderedDict()
+ all_nbest_json = collections.OrderedDict()
+ no_answer_score_json = collections.OrderedDict()
+ for index, (left_index, right_index) in enumerate(zip(dev_chunk_feature_ptr[:-1],
+ dev_chunk_feature_ptr[1:])):
+ chunked_features = dev_all_chunk_features[left_index:right_index]
+ results = all_results[left_index:right_index]
+ original_feature = dev_features[index]
+ qas_ids = set([result.qas_id for result in results] +
+ [feature.qas_id for feature in chunked_features])
+            assert len(qas_ids) == 1, 'Mismatch occurred between features and results'
+ example_qas_id = list(qas_ids)[0]
+ assert example_qas_id == original_feature.qas_id, \
+                'Mismatch occurred between the original feature and chunked features'
+ not_answerable_score, best_pred, nbest_json = predict_extended(
+ original_feature=original_feature,
+ chunked_features=chunked_features,
+ results=results,
+ n_best_size=args.n_best_size,
+ max_answer_length=args.max_answer_length,
+ start_top_n=args.start_top_n,
+ end_top_n=args.end_top_n)
+ no_answer_score_json[example_qas_id] = not_answerable_score
+ all_predictions[example_qas_id] = best_pred
+ all_nbest_json[example_qas_id] = nbest_json
+
+ if args.version == '2.0':
+ exact = 'best_exact'
+ f1 = 'best_f1'
+ na_prob = no_answer_score_json
+ else:
+ exact = 'exact'
+ f1 = 'f1'
+ na_prob = None
+
+ cur_eval, revised_predictions = squad_eval(
+ dev_data_path, all_predictions, na_prob, revise=na_prob is not None)
+ logging.info('The evaluated results are {}'.format(json.dumps(cur_eval)))
+
+ cur_metrics = 0.5 * (cur_eval[exact] + cur_eval[f1])
+ if best_eval:
+ best_metrics = 0.5 * (best_eval[exact] + best_eval[f1])
+ else:
+ best_metrics = 0.
+
+ if cur_metrics > best_metrics:
+ logging.info('The evaluated files are saved in {}'.format(args.output_dir))
+ output_prediction_file = os.path.join(args.output_dir, 'predictions.json')
+ output_nbest_file = os.path.join(args.output_dir, 'nbest_predictions.json')
+ na_prob_file = os.path.join(args.output_dir, 'na_prob.json')
+ revised_prediction_file = os.path.join(args.output_dir, 'revised_predictions.json')
+
+ with open(output_prediction_file, 'w') as of:
+ of.write(json.dumps(all_predictions, indent=4) + '\n')
+ with open(output_nbest_file, 'w') as of:
+ of.write(json.dumps(all_nbest_json, indent=4) + '\n')
+ with open(na_prob_file, 'w') as of:
+ of.write(json.dumps(no_answer_score_json, indent=4) + '\n')
+ with open(revised_prediction_file, 'w') as of:
+ of.write(json.dumps(revised_predictions, indent=4) + '\n')
+
+ best_eval = cur_eval
+ best_eval.update({'best_ckpt': ckpt_name})
+ return best_eval
+
+ if args.param_checkpoint and args.param_checkpoint.endswith('.params'):
+ ckpt_candidates = [args.param_checkpoint]
+ else:
+ ckpt_candidates = [f for f in os.listdir(args.output_dir) if f.endswith('.params')]
+ ckpt_candidates.sort(key=lambda ele: (len(ele), ele))
+ if last:
+ ckpt_candidates = ckpt_candidates[-1:]
+
+ best_eval = {}
+ for ckpt_name in ckpt_candidates:
+        logging.info('Starting to evaluate checkpoint {}'.format(ckpt_name))
+ ckpt_path = os.path.join(args.output_dir, ckpt_name)
+ qa_net.load_parameters(ckpt_path, ctx=ctx_l, cast_dtype=True)
+ best_eval = eval_validation(ckpt_name, best_eval)
+
+ logging.info('The best evaluated results are {}'.format(json.dumps(best_eval)))
+ output_eval_results_file = os.path.join(args.output_dir, 'best_results.json')
+ with open(output_eval_results_file, 'w') as of:
+ of.write(json.dumps(best_eval, indent=4) + '\n')
+ return best_eval
+
+
+if __name__ == '__main__':
+ os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
+ args = parse_args()
+ logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
+ set_seed(args.seed)
+ if args.do_train:
+ train(args)
+ if args.do_eval:
+ evaluate(args, last=not args.all_evaluate)
diff --git a/scripts/question_answering/squad_utils.py b/scripts/question_answering/squad_utils.py
new file mode 100644
index 0000000000..80a27a9864
--- /dev/null
+++ b/scripts/question_answering/squad_utils.py
@@ -0,0 +1,455 @@
+"""Utility classes and functions for data processing"""
+import json
+import os
+import re
+import string
+import warnings
+from collections import namedtuple
+from typing import Optional, List
+
+import numpy as np
+import mxnet as mx
+from mxnet.gluon.utils import download
+from tqdm import tqdm
+
+from gluonnlp.data.tokenizers import BaseTokenizerWithVocab
+from gluonnlp.utils.preprocessing import match_tokens_with_char_spans
+
+mx.npx.set_np()
+
+# Matches if a string is an integer or a float
+int_float_regex = re.compile(r'^\d+\.?\d*$')
+
+
+def normalize_answer(s):
+ """Lower text and remove punctuation, articles and extra whitespace.
+ This is from the official evaluate-v2.0.py in SQuAD.
+ """
+
+ def remove_articles(text):
+ regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+ return re.sub(regex, ' ', text)
+
+ def white_space_fix(text):
+ return ' '.join(text.split())
+
+ def remove_punc(text):
+ exclude = set(string.punctuation)
+ return ''.join(ch for ch in text if ch not in exclude)
+
+ def lower(text):
+ return text.lower()
+
+ return white_space_fix(remove_articles(remove_punc(lower(s))))
+
+
+def get_official_squad_eval_script(version='2.0', download_dir=None):
+ url_info = {'2.0': ['evaluate-v2.0.py',
+ 'https://worksheets.codalab.org/rest/bundles/'
+ '0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/',
+ '5a584f1952c88b4088be5b51f2046a2c337aa706']}
+ if version not in url_info:
+ raise ValueError('Version {} is not supported'.format(version))
+ if download_dir is None:
+ download_dir = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
+ download_path = os.path.join(download_dir, url_info[version][0])
+ download(url_info[version][1], download_path, sha1_hash=url_info[version][2])
+ return download_path
+
+
+class SquadExample:
+ """A single training/test example for the Squad dataset, as loaded from disk."""
+    def __init__(self, qas_id: str,
+ query_text: str,
+ context_text: str,
+ answer_text: str,
+ start_position: int,
+ end_position: int,
+ title: str,
+ answers: Optional[List[str]] = None,
+ is_impossible: bool = False):
+ """
+
+ Parameters
+ ----------
+ qas_id
+ The example's unique identifier
+ query_text
+ The query string
+ context_text
+ The context string
+ answer_text
+ The answer string
+ start_position
+ The character position of the start of the answer
+ end_position
+ The character position of the end of the answer
+ title
+ The title of the example
+ answers
+ None by default, this is used during evaluation.
+ Holds answers as well as their start positions.
+ is_impossible
+ False by default, set to True if the example has no possible answer.
+ """
+ self.qas_id = qas_id
+ self.query_text = query_text
+ self.context_text = context_text
+ self.answer_text = answer_text
+ self.title = title
+ self.is_impossible = is_impossible
+ self.answers = answers
+ self.start_position = start_position
+ self.end_position = end_position
+
+ def to_json(self):
+ return json.dumps(self.__dict__)
+
+ @classmethod
+ def from_json(cls, s):
+ kwargs = json.loads(s)
+ return cls(**kwargs)
+
+
+DocChunk = namedtuple('DocChunk', ['start', 'length',
+ 'is_impossible',
+ 'gt_start_pos',
+ 'gt_end_pos'])
+
+
+class SquadFeature:
+ def __init__(self, qas_id,
+ query_token_ids,
+ context_text,
+ context_token_ids,
+ context_token_offsets,
+ is_impossible,
+ token_answer_mismatch,
+ unreliable_span,
+ gt_answer_text,
+ gt_start_pos,
+ gt_end_pos):
+ """The Squad Feature
+
+ Parameters
+ ----------
+ qas_id
+ The unique query/answer ID in the squad dataset
+ query_token_ids
+ The tokenized query.
+ context_text
+ The original text of the context
+ context_token_ids
+ The tokenized context.
+ context_token_offsets
+ The offsets of the tokens in the original context string
+ is_impossible
+ Whether the sample is impossible.
+ token_answer_mismatch
+ If this value is True, it means that we cannot reconstruct the ground-truth answer with
+ the tokenized version. Usually, the span-prediction-based approach won't be very
+ accurate and we should rely on the encoder-decoder approach.
+ For example:
+ GT: "japan", Tokenized Version: "japanese"
+                "six"       "sixth"
+ "one" "iPhone"
+ "breed" "breeding"
+ "emotion" "emotional"
+
+ unreliable_span
+ If this value is True, it means that we cannot rely on the gt_start_pos and gt_end_pos.
+ In this scenario, we cannot utilize the span-prediction-based approach.
+ One example is the question about "how many", the answer will spread across the
+ whole document and there is no clear span.
+ gt_answer_text
+ The ground-truth answer text
+ gt_start_pos
+ The start position of the ground-truth span. None indicates that there is no valid
+ ground-truth span.
+ gt_end_pos
+ The end position of the ground-truth span. None indicates that there is no valid
+ ground-truth span.
+ """
+ self.qas_id = qas_id
+ self.query_token_ids = query_token_ids
+ self.context_text = context_text
+ self.context_token_ids = context_token_ids
+ self.context_token_offsets = context_token_offsets
+ self.is_impossible = is_impossible
+ self.token_answer_mismatch = token_answer_mismatch
+ self.unreliable_span = unreliable_span
+ self.gt_answer_text = gt_answer_text
+ self.gt_start_pos = gt_start_pos
+ self.gt_end_pos = gt_end_pos
+
+ def to_json(self):
+ return json.dumps(self.__dict__)
+
+ @classmethod
+ def from_json(cls, s):
+ kwargs = json.loads(s)
+ return cls(**kwargs)
+
+ def __repr__(self):
+ return self.to_json()
+
+ def get_chunks(self, doc_stride, max_chunk_length=None):
+ """Get a sequence of chunks for the squad feature.
+
+ In reality, the document will be too long for the NLP model, and we will split it into
+ multiple chunks.
+
+ For example, consider the following
+ Doc: the man went to the store and bought a gallon of milk
+
+ We may divide it into four chunks:
+
+ Chunk 1: the man went to the
+ Chunk 2: to the store and bought
+ Chunk 3: and bought a gallon of
+ Chunk 4: gallon of milk
+
+        We will use our network to extract features for each chunk and aggregate them
+        afterwards. Since one token may appear in multiple chunks, we vote on the final
+        output using heuristic score functions.
+
+ Parameters
+ ----------
+ doc_stride
+ The stride used when the context is too large and is split across several features.
+ max_chunk_length
+ The maximum size of the chunk
+
+ Returns
+ -------
+ ret
+ List of DocChunk objects
+ """
+ doc_ptr = 0
+ max_chunk_length = max_chunk_length if max_chunk_length is not None else \
+ len(self.context_token_ids)
+ ret = []
+ while doc_ptr < len(self.context_token_ids):
+ chunk_length = min(max_chunk_length, len(self.context_token_ids) - doc_ptr)
+ if self.gt_answer_text is None:
+ chunk_gt_start_pos = None
+ chunk_gt_end_pos = None
+ chunk_is_impossible = True
+ else:
+ if self.gt_start_pos is not None and self.gt_end_pos is not None and\
+ self.gt_start_pos >= doc_ptr and self.gt_end_pos < doc_ptr + chunk_length:
+ # The chunk contains the ground-truth annotation
+ chunk_gt_start_pos = self.gt_start_pos - doc_ptr
+ chunk_gt_end_pos = self.gt_end_pos - doc_ptr
+ chunk_is_impossible = False
+ else:
+ chunk_gt_start_pos = None
+ chunk_gt_end_pos = None
+ chunk_is_impossible = True
+ ret.append(DocChunk(start=doc_ptr,
+ length=chunk_length,
+ is_impossible=chunk_is_impossible,
+ gt_start_pos=chunk_gt_start_pos,
+ gt_end_pos=chunk_gt_end_pos))
+ if doc_ptr + chunk_length == len(self.context_token_ids):
+ break
+ doc_ptr += doc_stride
+ return ret
+
+
+def get_squad_examples_from_json(json_file: str, is_training: bool) -> List[SquadExample]:
+ """
+ Read the whole entry of raw json file and convert it to examples.
+
+ Parameters
+ ----------
+ json_file
+ The path to the json file
+ is_training
+ Whether or not training
+
+ Returns
+ -------
+ ret
+ List of SquadExample objects
+ """
+ with open(json_file, 'r') as f:
+ data = json.load(f)
+ examples = []
+ for entry in tqdm(data['data']):
+ title = entry['title']
+ for paragraph in entry['paragraphs']:
+ context_text = paragraph['context']
+ for qa in paragraph['qas']:
+ qas_id = qa['id']
+ query_text = qa['question']
+ start_position = None
+ end_position = None
+ answer_text = None
+ answers = None
+ if "is_impossible" in qa:
+ is_impossible = qa["is_impossible"]
+ else:
+ is_impossible = False
+
+ if not is_impossible:
+ if is_training:
+ answer = qa["answers"][0]
+ answer_text = answer["text"]
+ start_position = answer["answer_start"]
+ end_position = start_position + len(answer_text)
+ if context_text[start_position:end_position] != answer_text:
+ warnings.warn(
+ 'Mismatch start/end and answer_text, start/end={}/{},'
+ ' answer text={}. qas={}'
+ .format(start_position, end_position, answer_text, qas_id))
+ else:
+ answers = qa["answers"]
+ example = SquadExample(
+ qas_id=qas_id,
+ query_text=query_text,
+ context_text=context_text,
+ answer_text=answer_text,
+ start_position=start_position,
+ end_position=end_position,
+ title=title,
+ is_impossible=is_impossible,
+ answers=answers,
+ )
+ examples.append(example)
+ return examples
+
+
+def get_squad_examples(data_dir, segment='train', version='1.1'):
+ """
+
+ Parameters
+ ----------
+ data_dir
+ The directory of the data
+    segment
+        The data segment, either 'train' or 'dev'
+    version
+        Version of the SQuAD dataset
+
+ Returns
+ -------
+ examples
+        A list of SquadExample objects
+ """
+ if version == '1.1':
+ train_path = os.path.join(data_dir, 'train-v1.1.json')
+ dev_path = os.path.join(data_dir, 'dev-v1.1.json')
+ elif version == '2.0':
+ train_path = os.path.join(data_dir, 'train-v2.0.json')
+ dev_path = os.path.join(data_dir, 'dev-v2.0.json')
+ else:
+ raise NotImplementedError
+
+ if segment == 'train':
+ examples = get_squad_examples_from_json(train_path, is_training=True)
+ elif segment == 'dev':
+ examples = get_squad_examples_from_json(dev_path, is_training=False)
+ else:
+ raise NotImplementedError
+
+ return examples
+
+
+def convert_squad_example_to_feature(example: SquadExample,
+ tokenizer: BaseTokenizerWithVocab,
+ is_training: bool):
+ """
+ Convert a SquadExample object to a SquadFeature object with the designated tokenizer.
+
+    There are actually a few examples that cannot be converted properly with token-level
+    tokenization, because the ground truth is given by the start position and the answer text,
+    and some examples are annotated with wrong labels. Thus, the attributes unreliable_span and
+    token_answer_mismatch are used to indicate these scenarios.
+
+ Parameters
+ ----------
+ example
+ A single squad example
+ tokenizer
+ The trained tokenizer
+ is_training
+        Whether to deal with the training case
+
+    Returns
+ -------
+ feature
+ A SquadFeature
+ """
+ context_text = example.context_text
+ answer_text = example.answer_text
+ query_text = example.query_text
+ context_token_ids, offsets = tokenizer.encode_with_offsets(context_text, int)
+ query_token_ids = tokenizer.encode(query_text, int)
+ gt_answer_text = answer_text
+ gt_span_start_pos, gt_span_end_pos = None, None
+ token_answer_mismatch = False
+ unreliable_span = False
+ np_offsets = np.array(offsets)
+ if is_training and not example.is_impossible:
+ assert example.start_position >= 0 and example.end_position >= 0
+ # We convert the character-level offsets to token-level offsets
+ # Also, if the answer after tokenization + detokenization is not the same as the original
+ # answer, we try to localize the answer text and do a rematch
+ candidates = [(example.start_position, example.end_position)]
+ all_possible_start_pos = {example.start_position}
+ find_all_candidates = False
+ lower_idx, upper_idx = None, None
+ first_lower_idx, first_upper_idx = None, None
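+        # Search strategy: first try the annotated character span; if the detokenized text of
+        # that span does not normalize to the answer, fall back to every occurrence of the
+        # answer text in the context and keep the first candidate whose normalized text matches.
+        # If none matches, revert to the original span and flag token_answer_mismatch.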
+ while len(candidates) > 0:
+ start_position, end_position = candidates.pop()
+ # Match the token offsets
+ token_start_ends = match_tokens_with_char_spans(np_offsets,
+ np.array([[start_position,
+ end_position]]))
+ lower_idx = int(token_start_ends[0][0])
+ upper_idx = int(token_start_ends[0][1])
+ if not find_all_candidates:
+ first_lower_idx = lower_idx
+ first_upper_idx = upper_idx
+ # The new start pos and end_pos are the lower_idx and upper_idx
+ sliced_answer = context_text[offsets[lower_idx][0]:offsets[upper_idx][1]]
+ norm_sliced_answer = normalize_answer(sliced_answer)
+ norm_answer = normalize_answer(answer_text)
+ if norm_sliced_answer != norm_answer:
+ if not find_all_candidates:
+ # Try to find a better start+end of the answer and insert all positions to the
+ # candidates
+ find_all_candidates = True
+ pos = context_text.find(answer_text)
+ while pos != -1:
+ if pos not in all_possible_start_pos:
+ all_possible_start_pos.add(pos)
+ candidates.append((pos, pos + len(answer_text)))
+ pos = context_text.find(answer_text, pos + 1)
+ elif len(candidates) == 0:
+ token_answer_mismatch = True
+ lower_idx = first_lower_idx
+ upper_idx = first_upper_idx
+ if int_float_regex.match(answer_text):
+ # Find an integer/float and the sample won't be reliable.
+ # The span-based approach is not suitable for this scenario and we will
+ # set the unreliable span flag.
+ unreliable_span = True
+ else:
+ break
+
+ gt_span_start_pos = lower_idx
+ gt_span_end_pos = upper_idx
+
+ feature = SquadFeature(qas_id=example.qas_id,
+ query_token_ids=query_token_ids,
+ context_text=context_text,
+ context_token_ids=context_token_ids,
+ context_token_offsets=offsets,
+ is_impossible=example.is_impossible,
+ token_answer_mismatch=token_answer_mismatch,
+ unreliable_span=unreliable_span,
+ gt_answer_text=gt_answer_text,
+ gt_start_pos=gt_span_start_pos,
+ gt_end_pos=gt_span_end_pos)
+ return feature
diff --git a/scripts/question_answering/utils.py b/scripts/question_answering/utils.py
deleted file mode 100644
index 2e55f7e098..0000000000
--- a/scripts/question_answering/utils.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Various utility methods for Question Answering"""
-import math
-
-
-def warm_up_lr(base_lr, iteration, lr_warmup_steps):
- """Returns learning rate based on current iteration.
-
- This function is used to implement learning rate warm up technique.
-
- math::
-
- lr = min(base_lr, base_lr * (log(iteration) / log(lr_warmup_steps)))
-
- Parameters
- ----------
- base_lr : float
- Initial learning rage
- iteration : int
- Current iteration number
- lr_warmup_steps : int
- Learning rate warm up steps
-
- Returns
- -------
- learning_rate : float
- Learning rate
- """
- return min(base_lr, base_lr * (math.log(iteration) / math.log(lr_warmup_steps)))
diff --git a/scripts/sentiment_analysis/__init__.py b/scripts/sentiment_analysis/__init__.py
deleted file mode 100644
index 8d81276b5d..0000000000
--- a/scripts/sentiment_analysis/__init__.py
+++ /dev/null
@@ -1,19 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# pylint: disable=wildcard-import
-"""Sentiment Analysis example."""
diff --git a/scripts/sentiment_analysis/finetune_lm.py b/scripts/sentiment_analysis/finetune_lm.py
deleted file mode 100644
index 3528663076..0000000000
--- a/scripts/sentiment_analysis/finetune_lm.py
+++ /dev/null
@@ -1,344 +0,0 @@
-"""
-Fine-tune Language Model for Sentiment Analysis
-===============================================
-
-This example shows how to load a language model pre-trained on wikitext-2 in Gluon NLP Toolkit model
-zoo, and reuse the language model encoder for sentiment analysis on IMDB movie reviews dataset.
-"""
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import time
-import random
-import glob
-import multiprocessing as mp
-
-import numpy as np
-
-import mxnet as mx
-from mxnet import gluon, autograd
-from mxnet.gluon import HybridBlock
-from mxnet.gluon.data import DataLoader
-
-import gluonnlp as nlp
-
-nlp.utils.check_version('0.7.0')
-
-np.random.seed(100)
-random.seed(100)
-mx.random.seed(10000)
-
-tokenizer = nlp.data.SpacyTokenizer('en')
-length_clip = nlp.data.ClipSequence(500)
-
-
-parser = argparse.ArgumentParser(description='MXNet Sentiment Analysis Example on IMDB. '
- 'We load a LSTM model that is pre-trained on '
- 'WikiText as our encoder.')
-parser.add_argument('--lm_model', type=str, default='standard_lstm_lm_200',
- help='type of the pre-trained model to load, can be "standard_lstm_200", '
- '"standard_lstm_200", etc.')
-parser.add_argument('--use-mean-pool', type=bool, default=True,
- help='whether to use mean pooling to aggregate the states from '
- 'different timestamps.')
-parser.add_argument('--no_pretrained', action='store_true',
- help='Turn on the option to just use the structure and '
- 'not load the pre-trained weights.')
-parser.add_argument('--lr', type=float, default=2.5E-3,
- help='initial learning rate')
-parser.add_argument('--clip', type=float, default=None, help='gradient clipping')
-parser.add_argument('--bucket_type', type=str, default=None,
- help='Can be "fixed" or "sorted"')
-parser.add_argument('--bucket_num', type=int, default=10,
- help='The bucket_num if bucket_type is "fixed".')
-parser.add_argument('--bucket_ratio', type=float, default=0.0,
- help='The ratio used in the FixedBucketSampler.')
-parser.add_argument('--bucket_mult', type=int, default=100,
- help='The mult used in the SortedBucketSampler.')
-parser.add_argument('--valid_ratio', type=float, default=0.05,
- help='Proportion [0, 1] of training samples to use for validation set.')
-parser.add_argument('--epochs', type=int, default=20,
- help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=16, metavar='N',
- help='batch size')
-parser.add_argument('--dropout', type=float, default=0.,
- help='dropout applied to layers (0 = no dropout)')
-parser.add_argument('--log-interval', type=int, default=30, metavar='N',
- help='report interval')
-parser.add_argument('--save-prefix', type=str, default='sa-model',
- help='path to save the final model')
-parser.add_argument('--gpu', type=int, default=None,
-                    help='id of the gpu to use. Leave it unset to use the cpu.')
-args = parser.parse_args()
-print(args)
-
-pretrained = not args.no_pretrained
-if args.gpu is None:
- print('Use cpu')
- context = mx.cpu()
-else:
- print('Use gpu%d' % args.gpu)
- context = mx.gpu(args.gpu)
-
-class AggregationLayer(HybridBlock):
- """A block for different ways of aggregating encoder features"""
- def __init__(self, use_mean_pool=False, prefix=None, params=None):
- super(AggregationLayer, self).__init__(prefix=prefix, params=params)
- self._use_mean_pool = use_mean_pool
-
- def hybrid_forward(self, F, data, valid_length): # pylint: disable=arguments-differ
- """Forward logic"""
- # Data will have shape (T, N, C)
- if self._use_mean_pool:
- masked_encoded = F.SequenceMask(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- agg_state = F.broadcast_div(F.sum(masked_encoded, axis=0),
- F.expand_dims(valid_length, axis=1))
- else:
- agg_state = F.SequenceLast(data,
- sequence_length=valid_length,
- use_sequence_length=True)
- return agg_state
-
-
-class SentimentNet(HybridBlock):
- """Network for sentiment analysis."""
- def __init__(self, dropout, use_mean_pool=False, prefix=None, params=None):
- super(SentimentNet, self).__init__(prefix=prefix, params=params)
- self._use_mean_pool = use_mean_pool
- with self.name_scope():
- self.embedding = None
- self.encoder = None
- self.agg_layer = AggregationLayer(use_mean_pool=use_mean_pool)
- self.output = gluon.nn.HybridSequential()
- with self.output.name_scope():
- self.output.add(gluon.nn.Dropout(dropout))
- self.output.add(gluon.nn.Dense(1, flatten=False))
-
- def hybrid_forward(self, _, data, valid_length): # pylint: disable=arguments-differ
- encoded = self.encoder(self.embedding(data)) # Shape(T, N, C)
- agg_state = self.agg_layer(encoded, valid_length)
- out = self.output(agg_state)
- return out
-
-net = SentimentNet(dropout=args.dropout, use_mean_pool=args.use_mean_pool)
-with net.name_scope():
- lm_model, vocab = nlp.model.get_model(name=args.lm_model,
- dataset_name='wikitext-2',
- pretrained=pretrained,
- ctx=context,
- dropout=args.dropout)
-
-net.embedding = lm_model.embedding
-net.encoder = lm_model.encoder
-net.hybridize()
-
-
-# Dataset preprocessing
-def preprocess(x):
- data, label = x
- label = int(label > 5)
- data = vocab[length_clip(tokenizer(data))]
- return data, label
-
-def get_length(x):
- return float(len(x[0]))
-
-# Load the dataset
-train_dataset, test_dataset = [nlp.data.IMDB(root='data/imdb', segment=segment)
- for segment in ('train', 'test')]
-train_dataset, valid_dataset = nlp.data.train_valid_split(train_dataset, args.valid_ratio)
-print('Tokenize using spaCy...')
-
-def preprocess_dataset(dataset):
- start = time.time()
-    with mp.Pool(8) as pool:
-        dataset = gluon.data.SimpleDataset(pool.map(preprocess, dataset))
-        lengths = gluon.data.SimpleDataset(pool.map(get_length, dataset))
- end = time.time()
- print('Done! Tokenizing Time={:.2f}s, #Sentences={}'.format(end - start, len(dataset)))
- return dataset, lengths
-
-# Preprocess the dataset
-train_dataset, train_data_lengths = preprocess_dataset(train_dataset)
-valid_dataset, valid_data_lengths = preprocess_dataset(valid_dataset)
-test_dataset, test_data_lengths = preprocess_dataset(test_dataset)
-
-# Construct the DataLoader. Pad data and stack label
-batchify_fn = nlp.data.batchify.Tuple(nlp.data.batchify.Pad(axis=0, pad_val=0, ret_length=True),
- nlp.data.batchify.Stack(dtype='float32'))
-if args.bucket_type is None:
- print('Bucketing strategy is not used!')
- train_dataloader = DataLoader(dataset=train_dataset,
- batch_size=args.batch_size,
- shuffle=True,
- batchify_fn=batchify_fn)
-else:
- if args.bucket_type == 'fixed':
- print('Use FixedBucketSampler')
- batch_sampler = nlp.data.FixedBucketSampler(train_data_lengths,
- batch_size=args.batch_size,
- num_buckets=args.bucket_num,
- ratio=args.bucket_ratio,
- shuffle=True)
- print(batch_sampler.stats())
- elif args.bucket_type == 'sorted':
- print('Use SortedBucketSampler')
- batch_sampler = nlp.data.SortedBucketSampler(train_data_lengths,
- batch_size=args.batch_size,
- mult=args.bucket_mult,
- shuffle=True)
- else:
- raise NotImplementedError
- train_dataloader = DataLoader(dataset=train_dataset,
- batch_sampler=batch_sampler,
- batchify_fn=batchify_fn)
-
-valid_dataloader = DataLoader(dataset=valid_dataset,
- batch_size=args.batch_size,
- shuffle=False,
- sampler=nlp.data.SortedSampler(valid_data_lengths),
- batchify_fn=batchify_fn)
-
-test_dataloader = DataLoader(dataset=test_dataset,
- batch_size=args.batch_size,
- shuffle=False,
- sampler=nlp.data.SortedSampler(test_data_lengths),
- batchify_fn=batchify_fn)
-
-
-net.hybridize()
-print(net)
-if args.no_pretrained:
- net.initialize(mx.init.Xavier(), ctx=context)
-else:
- net.output.initialize(mx.init.Xavier(), ctx=context)
-trainer = gluon.Trainer(net.collect_params(), 'ftml', {'learning_rate': args.lr})
-loss = gluon.loss.SigmoidBCELoss()
-
-
-def evaluate(dataloader):
- """Evaluate network on the specified dataset"""
- total_L = 0.0
- total_sample_num = 0
- total_correct_num = 0
- start_log_interval_time = time.time()
- print('Begin Testing...')
- for i, ((data, valid_length), label) in enumerate(dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- label = label.as_in_context(context)
- output = net(data, valid_length)
- L = loss(output, label)
-        pred = (output > 0).reshape((-1,))
- total_L += L.sum().asscalar()
- total_sample_num += label.shape[0]
- total_correct_num += (pred == label).sum().asscalar()
- if (i + 1) % args.log_interval == 0:
- print('[Batch {}/{}] elapsed {:.2f} s'.format(
- i + 1, len(dataloader), time.time() - start_log_interval_time))
- start_log_interval_time = time.time()
- avg_L = total_L / float(total_sample_num)
- acc = total_correct_num / float(total_sample_num)
- return avg_L, acc
-
-
-def train():
- """Training process"""
- start_pipeline_time = time.time()
-
- # Training/Testing
- best_valid_acc = 0
- stop_early = 0
- for epoch in range(args.epochs):
- # Epoch training stats
- start_epoch_time = time.time()
- epoch_L = 0.0
- epoch_sent_num = 0
- epoch_wc = 0
- # Log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0.0
-
- for i, ((data, valid_length), label) in enumerate(train_dataloader):
- data = mx.nd.transpose(data.as_in_context(context))
- label = label.as_in_context(context)
- valid_length = valid_length.as_in_context(context).astype(np.float32)
- wc = valid_length.sum().asscalar()
- log_interval_wc += wc
- epoch_wc += wc
- log_interval_sent_num += data.shape[1]
- epoch_sent_num += data.shape[1]
- with autograd.record():
- output = net(data, valid_length)
- L = loss(output, label).mean()
- L.backward()
- # Clip gradient
- if args.clip is not None:
- grads = [p.grad(context) for p in net.collect_params().values()]
- gluon.utils.clip_global_norm(grads, args.clip)
- # Update parameter
- trainer.step(1)
- log_interval_L += L.asscalar()
- epoch_L += L.asscalar()
- if (i + 1) % args.log_interval == 0:
- print('[Epoch %d Batch %d/%d] avg loss %g, throughput %gK wps' % (
- epoch, i + 1, len(train_dataloader),
- log_interval_L / log_interval_sent_num,
- log_interval_wc / 1000 / (time.time() - start_log_interval_time)))
- # Clear log interval training stats
- start_log_interval_time = time.time()
- log_interval_wc = 0
- log_interval_sent_num = 0
- log_interval_L = 0
- end_epoch_time = time.time()
- valid_avg_L, valid_acc = evaluate(valid_dataloader)
- test_avg_L, test_acc = evaluate(test_dataloader)
- print('[Epoch %d] train avg loss %g, '
- 'valid acc %.4f, valid avg loss %g, '
- 'test acc %.4f, test avg loss %g, throughput %gK wps' % (
- epoch, epoch_L / epoch_sent_num,
- valid_acc, valid_avg_L, test_acc, test_avg_L,
- epoch_wc / 1000 / (end_epoch_time - start_epoch_time)))
-
- if valid_acc < best_valid_acc:
- print('No Improvement.')
- stop_early += 1
- if stop_early == 3:
- break
-        else:
-            # Reset stop_early when the validation accuracy reaches a new best value
-            print('Observed Improvement.')
-            stop_early = 0
- net.save_parameters(args.save_prefix + '_{:04d}.params'.format(epoch))
- best_valid_acc = valid_acc
-
-    net.load_parameters(sorted(glob.glob(args.save_prefix+'_*.params'))[-1], context)
- valid_avg_L, valid_acc = evaluate(valid_dataloader)
- test_avg_L, test_acc = evaluate(test_dataloader)
- print('Best validation loss %g, validation acc %.4f'%(valid_avg_L, valid_acc))
- print('Best test loss %g, test acc %.4f'%(test_avg_L, test_acc))
- print('Total time cost %.2fs'%(time.time()-start_pipeline_time))
-
-
-if __name__ == '__main__':
- train()
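
The fine-tuning pattern the deleted script implements can be summarized in a few lines: load a pre-trained language model from the Gluon NLP model zoo, reuse its embedding and encoder, mean-pool the encoder states over the valid time steps, and attach a dropout plus single-logit dense head trained with sigmoid binary cross-entropy. The following is a minimal sketch of that pattern, assuming the gluonnlp 0.x and MXNet 1.x APIs used above; the model name and the embedding/encoder attribute names are taken from the script itself and are not verified against other versions.

import mxnet as mx
from mxnet import gluon
import gluonnlp as nlp

ctx = mx.cpu()

# Pre-trained LSTM language model and its vocabulary from the model zoo.
lm_model, vocab = nlp.model.get_model('standard_lstm_lm_200',
                                      dataset_name='wikitext-2',
                                      pretrained=True, ctx=ctx)

# Classification head: dropout followed by a single logit.
head = gluon.nn.HybridSequential()
head.add(gluon.nn.Dropout(0.2))
head.add(gluon.nn.Dense(1, flatten=False))
head.initialize(mx.init.Xavier(), ctx=ctx)

def classify(data, valid_length):
    """data: (T, N) token ids; valid_length: (N,) float32 lengths."""
    encoded = lm_model.encoder(lm_model.embedding(data))      # (T, N, C)
    masked = mx.nd.SequenceMask(encoded, sequence_length=valid_length,
                                use_sequence_length=True)
    # Mean over the valid time steps only.
    pooled = mx.nd.broadcast_div(mx.nd.sum(masked, axis=0),
                                 mx.nd.expand_dims(valid_length, axis=1))
    return head(pooled)                                        # (N, 1) logits

Training then mirrors the loop above: the logits feed gluon.loss.SigmoidBCELoss() and the parameters are updated with the 'ftml' trainer.
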
diff --git a/scripts/sentiment_analysis/index.rst b/scripts/sentiment_analysis/index.rst
deleted file mode 100644
index ef5208f019..0000000000
--- a/scripts/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,247 +0,0 @@
-Sentiment Analysis
-------------------
-
-:download:`Download scripts `
-
-Through Fine-tuning Word Language Model
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This script can be used to train a sentiment analysis model from scratch, or fine-tune a pre-trained language model.
-The pre-trained language models are loaded from the Gluon NLP Toolkit model zoo. It also showcases how to use different
-bucketing strategies to speed up training.
-
-Use the following command to run without using a pre-trained model (`log